Source code for FIBbootstrap.skeleton

# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import glob
import numpy as np
from .utils import calculate_errors
import pandas as pd
from datetime import datetime as dt
from tqdm import tqdm

__all__ = ['bootstrap_skel_stats',
           'find_nodes']


# noinspection PyTypeChecker
[docs]def find_nodes(input_fname, node_output='nodes.txt', save_output=True, return_type='str'): """ Calculate and write a data file describing the connectivity of nodes within a network (saved in an .mv3d file from Avizo) Parameters ---------- input_fname: str .mv3d filename to read node_output: str name of text file to write to, if saving the output save_output: bool switch to control whether a file is written to disk (output format that matches the old :download:`findNodes.sh <../old_bash/findNodes.sh>` script) return_type: str passed to :py:meth:`numpy.ndarray.astype` to determine how the output should be formatted upon return. Default is as a string, so no information contained in the original .mv3d file is modified, but oftentimes ``'float32'`` would be more useful for calculating statistics Returns ------- data: :class:`~numpy.ndarray` Contains the output data in a numpy array with columns of: +-----------+----------+----------+----------+---------+ |:math:`k_i`|x position|y position|z position|thickness| +-----------+----------+----------+----------+---------+ num_edges: :class:`float` Number of edges (E) in skeleton num_nodes: :class:`float` Number of nodes (N) in skeleton mean_k: :class:`float` Average node connectivity (<k>) """ with open(input_fname, "r") as i_file: lines = i_file.readlines()[7:] # read data into list, removing header # Get the directory name: dir_name = os.path.abspath(os.path.join(os.path.abspath(input_fname), '..')) # remove empty lines: while '\n' in lines: lines.remove('\n') # replace tabs with spaces and trim newline characters for i, l in enumerate(lines): lines[i] = l.replace('\n', '').split('\t') # Convert to a numpy array, rather than just a list data = np.array(lines) # Find unique lines in first column (number of edges) num_edges = len(set(data[:, 0])) # Get first column, which is the index (number) of each edge edgenums = data[:, 0] # Determine where the end points of each edge are: end_idx = np.zeros(1, dtype='int') # always one at zero for i in np.where(np.diff(edgenums.astype('int')) == 1)[0]: end_idx = np.append(end_idx, i) # find where each edge ends, and add end_idx = np.append(end_idx, i + 1) # position and next start to list end_idx = np.append(end_idx, len(edgenums) - 1) # add the last point, too # end_points = data[end_idx] is now a list of edge end points end_points = data[end_idx] # join each row of the end points list into a single string string_list = [' '.join(row) for row in end_points[:, 1:]] # count how many times each end point shows up edges, counts = np.unique(string_list, return_counts=True) # put the counts before each unique edge, transpose it, then sort (and # reverse) the list, so the output matches the previous findNodes.sh script # noinspection PyUnusedLocal results = sorted([' '.join(s) for s in np.asarray((counts, edges)).T])[::-1] # Create array of all the data, with counts at the beginning data = np.c_[counts, np.array([i.split(' ') for i in edges])] # sort by counts column sort_idx = np.lexsort((data[:, 0],))[::-1] data = data[sort_idx] # Find number of nodes by taking the length of the data num_nodes = len(data) # Find average node connectivity mean_k = data.astype('float32')[:, 0].mean() if save_output: np.savetxt(node_output, data, fmt='%s', header="Python findNodes run at {0}\n\n" "List of nodes in input skeleton \"{1}\"\n" " from working directory \"{2}\"\n" "\n" "{3} total edges in skeleton\n" "{4} total nodes in skeleton\n" "\n" "of\n" "edges x\t\t\ty\t\t\t\tz\t\t\tThickness\n" "----------------------------------------------" "---------".format(dt.strftime(dt.now(), "%A - %b %d, " "%Y - %I:%M:%S %p"), os.path.basename(input_fname), dir_name, num_edges, num_nodes), delimiter='\t', comments='## ') return data.astype(return_type), num_edges, num_nodes, mean_k
[docs]def bootstrap_skel_stats(mv3d_pattern, csv_pattern=None, n_bootstrap=100000, volume=None, save_output=False, data_output_fname=None, err_output_fname=None): """ Calculate errors for skeleton statistics, as output by the Avizo subsampling script. Operates on many .mv3d skeleton files Parameters ---------- mv3d_pattern: str glob pattern to grab mv3d files to process from output of subvolume Avizo scripts. Based off of these filenames, corresponding .csv files containing the spatial graph statistics will be accessed as well. If the mv3d files are named: ``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.mv3d`` The following pattern will be used to glob for the Spatial graph stats: ``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.csv`` csv_pattern: :class:`str` or :data:`None` glob pattern to grab csv files to process from Avizo spatial graph output. If None, will attempt a calculation from the ``mv3d_pattern`` value as described above n_bootstrap: int number of bootstrap samples to use when calculating confidence intervals volume: :data:`None` or :ref:`number <numbers>` volume of analyzed data cube. If this is given, data will be returned with nodes/volume and edges/volume given, in addition to the absolute values save_output: bool switch to control whether or not the output is written directly to a CSV file in the current directory data_output_fname: :data:`None` or :class:`str` filename to use when saving the data output; if None, an appropriate string will be built from the input ``pattern`` err_output_fname: :data:`None` or :class:`str` filename to use when saving the error output; if None, an appropriate string will be built from the input ``pattern`` Returns ------- data_df: :class:`pandas.DataFrame` Dataframe with data from subvolume statistic calculations error_df: :class:`~pandas.DataFrame` Dataframe with low and high errors calculated using n_bootstrap samples """ mv3d_filelist = glob.glob(mv3d_pattern) if len(mv3d_filelist) == 0: raise ValueError("Did not find any .mv3d files!") # Find index of second occurrence of a dot if csv_pattern is None: second_dot = mv3d_pattern.rindex('.', 0, mv3d_pattern.rindex('.') - 1) csv_pattern = mv3d_pattern[:second_dot] + '.*csv' csv_filelist = glob.glob(csv_pattern) # Control switching of csv calculations run_csv = False if len(csv_filelist) > 0: print(len(csv_filelist)) run_csv = True if len(csv_filelist) != len(mv3d_filelist): print("mv3d pattern: " + mv3d_pattern) print("csv pattern: " + csv_pattern) raise ValueError("mv3d and csv filelists were different lengths") if save_output and data_output_fname is None: data_output_fname = mv3d_pattern[ :-1 * mv3d_pattern[::-1].index('*') - 1] + \ 'bootstrap_data.csv' if save_output and err_output_fname is None: err_output_fname = mv3d_pattern[ :-1 * mv3d_pattern[::-1].index('*') - 1] + \ 'bootstrap_errors.csv' # Blank results dataframe: if volume: # Include the volume normalized data, if volume given data_df = pd.DataFrame(columns=['E', 'E/V', 'N', 'N/V', 'mean_k']) else: data_df = pd.DataFrame(columns=['E', 'N', 'mean_k']) # Loop over all the mv3d files and add the results from find_nodes to # the data frame bar = tqdm(mv3d_filelist, desc='Running find_nodes') for mv3d_file in bar: data_df = data_df.append( dict(list(zip(['E', 'N', 'mean_k'], list( find_nodes(mv3d_file, save_output=False)[1:])))), ignore_index=True) if volume: data_df['N/V'] = data_df['N'] / volume data_df['E/V'] = data_df['E'] / volume bar.set_description('Running find_nodes on ' + os.path.basename(mv3d_file)) if run_csv: # Add columns of interest to the dataframe data_df["perc_deg"] = "" data_df["topo_length"] = "" # Loop over all the mv3d files and add the results from find_nodes to # the data frame bar = tqdm(csv_filelist, desc='Analyzing CSV files') for i, csv_file in enumerate(bar): # Read current file into a dataframe df = pd.read_csv(csv_file, skiprows=1) # Slice the dataframe to only include the rows before the # 'Total' row df = df.ix[:(df['Graph ID'] == 'Total').idxmax() - 1] # Set the index to the Graph ID column df = df.set_index('Graph ID') df.index.name = None # Sort by total length (descending) df = df.sort_values(by=['Total Length'], ascending=False) # Get total sum of network length: tot_length = df['Total Length'].sum() # Create fraction of total length and cumulative length columns: df['% total length'] = df['Total Length'] / tot_length df['Cum Length'] = df['Total Length'].cumsum() # Calculate percolation degree by finding the first graph that # adds less than 1% to the total network length, when summed # cumulatively. perc_degree is the fraction of total network # length included at this point: try: perc_degree = df['Cum Length'][ df['% total length'] < 0.01][0] / tot_length except IndexError as _: perc_degree = 1 # Set percolation degree within data_df data_df.set_value(i, 'perc_deg', perc_degree) # Topological length is total network length divided by number # of nodes; add to data_df data_df.set_value(i, 'topo_length', tot_length / data_df['N'][i]) error_df = calculate_errors(data_df, n_bootstrap) if save_output: data_df.to_csv(path_or_buf=data_output_fname) print(("Data output saved to {}".format(data_output_fname))) error_df.to_csv(path_or_buf=err_output_fname) print(("Error output saved to {}".format(err_output_fname))) return data_df, error_df