Source code for FIBbootstrap.skeleton

# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import glob
import numpy as np
from .utils import calculate_errors
import pandas as pd
from datetime import datetime as dt
from tqdm import tqdm

__all__ = ['bootstrap_skel_stats',
           'find_nodes']


# noinspection PyTypeChecker
[docs]def find_nodes(input_fname,
               node_output='nodes.txt',
               save_output=True,
               return_type='str'):
    """
    Calculate and write a data file describing the connectivity of nodes
    within a network (saved in an .mv3d file from Avizo)

    Parameters
    ----------
    input_fname: str
        .mv3d filename to read
    node_output: str
        name of text file to write to, if saving the output
    save_output: bool
        switch to control whether a file is written to disk (output format
        that matches the old
        :download:`findNodes.sh <../old_bash/findNodes.sh>` script)
    return_type: str
        passed to :py:meth:`numpy.ndarray.astype` to determine how the output
        should be formatted upon return. Default is as a string,
        so no information contained in the original .mv3d file is modified,
        but oftentimes ``'float32'`` would be more useful for calculating
        statistics

    Returns
    -------
    data: :class:`~numpy.ndarray`
        Contains the output data in a numpy array with columns of:

        +-----------+----------+----------+----------+---------+
        |:math:`k_i`|x position|y position|z position|thickness|
        +-----------+----------+----------+----------+---------+

    num_edges:  :class:`float`
        Number of edges (E) in skeleton
    num_nodes: :class:`float`
        Number of nodes (N) in skeleton
    mean_k: :class:`float`
        Average node connectivity (<k>)
    """

    with open(input_fname, "r") as i_file:
        lines = i_file.readlines()[7:]  # read data into list, removing header

    # Get the directory name:
    dir_name = os.path.abspath(os.path.join(os.path.abspath(input_fname),
                                            '..'))
    # remove empty lines:
    while '\n' in lines:
        lines.remove('\n')

    # replace tabs with spaces and trim newline characters
    for i, l in enumerate(lines):
        lines[i] = l.replace('\n', '').split('\t')

    # Convert to a numpy array, rather than just a list
    data = np.array(lines)

    # Find unique lines in first column (number of edges)
    num_edges = len(set(data[:, 0]))

    # Get first column, which is the index (number) of each edge
    edgenums = data[:, 0]

    # Determine where the end points of each edge are:
    end_idx = np.zeros(1, dtype='int')  # always one at zero

    for i in np.where(np.diff(edgenums.astype('int')) == 1)[0]:
        end_idx = np.append(end_idx, i)  # find where each edge ends, and add
        end_idx = np.append(end_idx, i + 1)  # position and next start to list

    end_idx = np.append(end_idx, len(edgenums) - 1)  # add the last point, too

    # end_points = data[end_idx] is now a list of edge end points
    end_points = data[end_idx]

    # join each row of the end points list into a single string
    string_list = [' '.join(row) for row in end_points[:, 1:]]

    # count how many times each end point shows up
    edges, counts = np.unique(string_list, return_counts=True)

    # put the counts before each unique edge, transpose it, then sort (and
    # reverse) the list, so the output matches the previous findNodes.sh script
    # noinspection PyUnusedLocal
    results = sorted([' '.join(s) for
                      s in np.asarray((counts, edges)).T])[::-1]

    # Create array of all the data, with counts at the beginning
    data = np.c_[counts, np.array([i.split(' ') for i in edges])]

    # sort by counts column
    sort_idx = np.lexsort((data[:, 0],))[::-1]
    data = data[sort_idx]

    # Find number of nodes by taking the length of the data
    num_nodes = len(data)

    # Find average node connectivity
    mean_k = data.astype('float32')[:, 0].mean()

    if save_output:
        np.savetxt(node_output,
                   data,
                   fmt='%s',
                   header="Python findNodes run at {0}\n\n"
                          "List of nodes in input skeleton \"{1}\"\n"
                          " from working directory \"{2}\"\n"
                          "\n"
                          "{3} total edges in skeleton\n"
                          "{4} total nodes in skeleton\n"
                          "\n"
                          "of\n"
                          "edges x\t\t\ty\t\t\t\tz\t\t\tThickness\n"
                          "----------------------------------------------"
                          "---------".format(dt.strftime(dt.now(),
                                                         "%A - %b %d, "
                                                         "%Y - %I:%M:%S %p"),
                                             os.path.basename(input_fname),
                                             dir_name,
                                             num_edges,
                                             num_nodes),
                   delimiter='\t',
                   comments='## ')

    return data.astype(return_type), num_edges, num_nodes, mean_k


[docs]def bootstrap_skel_stats(mv3d_pattern,
                         csv_pattern=None,
                         n_bootstrap=100000,
                         volume=None,
                         save_output=False,
                         data_output_fname=None,
                         err_output_fname=None):
    """
    Calculate errors for skeleton statistics, as output by the Avizo
    subsampling script. Operates on many .mv3d skeleton files

    Parameters
    ----------
    mv3d_pattern: str
        glob pattern to grab mv3d files to process from output of subvolume
        Avizo scripts. Based off of these filenames, corresponding .csv
        files containing the spatial graph statistics will be accessed as well.
        If the mv3d files are named:
        ``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.mv3d``
        The following pattern will be used to glob for the Spatial graph
        stats: ``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.csv``
    csv_pattern: :class:`str` or :data:`None`
        glob pattern to grab csv files to process from Avizo spatial graph
        output. If None, will attempt a calculation from the ``mv3d_pattern``
        value as described above
    n_bootstrap: int
        number of bootstrap samples to use when calculating confidence
        intervals
    volume: :data:`None` or :ref:`number <numbers>`
        volume of analyzed data cube. If this is given, data will be
        returned with nodes/volume and edges/volume given, in addition to
        the absolute values
    save_output: bool
        switch to control whether or not the output is written directly to a
        CSV file in the current directory
    data_output_fname: :data:`None` or :class:`str`
        filename to use when saving the data output; if None, an appropriate
        string will be built from the input ``pattern``
    err_output_fname: :data:`None` or :class:`str`
        filename to use when saving the error output; if None, an appropriate
        string will be built from the input ``pattern``

    Returns
    -------
    data_df: :class:`pandas.DataFrame`
        Dataframe with data from subvolume statistic calculations

    error_df: :class:`~pandas.DataFrame`
        Dataframe with low and high errors calculated using n_bootstrap
        samples
    """
    mv3d_filelist = glob.glob(mv3d_pattern)

    if len(mv3d_filelist) == 0:
        raise ValueError("Did not find any .mv3d files!")

    # Find index of second occurrence of a dot
    if csv_pattern is None:
        second_dot = mv3d_pattern.rindex('.', 0, mv3d_pattern.rindex('.') - 1)
        csv_pattern = mv3d_pattern[:second_dot] + '.*csv'
    csv_filelist = glob.glob(csv_pattern)

    # Control switching of csv calculations
    run_csv = False
    if len(csv_filelist) > 0:
        print(len(csv_filelist))
        run_csv = True
        if len(csv_filelist) != len(mv3d_filelist):
            print("mv3d pattern: " + mv3d_pattern)
            print("csv pattern: " + csv_pattern)
            raise ValueError("mv3d and csv filelists were different lengths")

    if save_output and data_output_fname is None:
        data_output_fname = mv3d_pattern[
                            :-1 * mv3d_pattern[::-1].index('*') - 1] + \
                            'bootstrap_data.csv'
    if save_output and err_output_fname is None:
        err_output_fname = mv3d_pattern[
                           :-1 * mv3d_pattern[::-1].index('*') - 1] + \
                           'bootstrap_errors.csv'

    # Blank results dataframe:
    if volume:
        # Include the volume normalized data, if volume given
        data_df = pd.DataFrame(columns=['E', 'E/V', 'N', 'N/V', 'mean_k'])
    else:
        data_df = pd.DataFrame(columns=['E', 'N', 'mean_k'])

    # Loop over all the mv3d files and add the results from find_nodes to
    # the data frame
    bar = tqdm(mv3d_filelist, desc='Running find_nodes')
    for mv3d_file in bar:
        data_df = data_df.append(
            dict(list(zip(['E', 'N', 'mean_k'],
                          list(
                              find_nodes(mv3d_file, save_output=False)[1:])))),
            ignore_index=True)
        if volume:
            data_df['N/V'] = data_df['N'] / volume
            data_df['E/V'] = data_df['E'] / volume

        bar.set_description('Running find_nodes on ' +
                            os.path.basename(mv3d_file))

    if run_csv:
        # Add columns of interest to the dataframe
        data_df["perc_deg"] = ""
        data_df["topo_length"] = ""

        # Loop over all the mv3d files and add the results from find_nodes to
        # the data frame
        bar = tqdm(csv_filelist, desc='Analyzing CSV files')
        for i, csv_file in enumerate(bar):
            # Read current file into a dataframe
            df = pd.read_csv(csv_file, skiprows=1)

            # Slice the dataframe to only include the rows before the
            # 'Total' row
            df = df.ix[:(df['Graph ID'] == 'Total').idxmax() - 1]

            # Set the index to the Graph ID column
            df = df.set_index('Graph ID')
            df.index.name = None

            # Sort by total length (descending)
            df = df.sort_values(by=['Total Length'], ascending=False)

            # Get total sum of network length:
            tot_length = df['Total Length'].sum()

            # Create fraction of total length and cumulative length columns:
            df['% total length'] = df['Total Length'] / tot_length
            df['Cum Length'] = df['Total Length'].cumsum()

            # Calculate percolation degree by finding the first graph  that
            # adds less than 1% to the total network length, when summed
            # cumulatively. perc_degree is the fraction of total network
            # length included at this point:

            try:
                perc_degree = df['Cum Length'][
                                  df['% total length'] < 0.01][0] / tot_length
            except IndexError as _:
                perc_degree = 1

            # Set percolation degree within data_df
            data_df.set_value(i, 'perc_deg', perc_degree)

            # Topological length is total network length divided by number
            # of nodes; add to data_df
            data_df.set_value(i, 'topo_length', tot_length / data_df['N'][i])

    error_df = calculate_errors(data_df, n_bootstrap)

    if save_output:
        data_df.to_csv(path_or_buf=data_output_fname)
        print(("Data output saved to {}".format(data_output_fname)))
        error_df.to_csv(path_or_buf=err_output_fname)
        print(("Error output saved to {}".format(err_output_fname)))

    return data_df, error_df