# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import glob
import numpy as np
from .utils import calculate_errors
import pandas as pd
from datetime import datetime as dt
from tqdm import tqdm
__all__ = ['bootstrap_skel_stats',
'find_nodes']
# noinspection PyTypeChecker
[docs]def find_nodes(input_fname,
node_output='nodes.txt',
save_output=True,
return_type='str'):
"""
Calculate and write a data file describing the connectivity of nodes
within a network (saved in an .mv3d file from Avizo)
Parameters
----------
input_fname: str
.mv3d filename to read
node_output: str
name of text file to write to, if saving the output
save_output: bool
switch to control whether a file is written to disk (output format
that matches the old
:download:`findNodes.sh <../old_bash/findNodes.sh>` script)
return_type: str
passed to :py:meth:`numpy.ndarray.astype` to determine how the output
should be formatted upon return. Default is as a string,
so no information contained in the original .mv3d file is modified,
but oftentimes ``'float32'`` would be more useful for calculating
statistics
Returns
-------
data: :class:`~numpy.ndarray`
Contains the output data in a numpy array with columns of:
+-----------+----------+----------+----------+---------+
|:math:`k_i`|x position|y position|z position|thickness|
+-----------+----------+----------+----------+---------+
num_edges: :class:`float`
Number of edges (E) in skeleton
num_nodes: :class:`float`
Number of nodes (N) in skeleton
mean_k: :class:`float`
Average node connectivity (<k>)
"""
with open(input_fname, "r") as i_file:
lines = i_file.readlines()[7:] # read data into list, removing header
# Get the directory name:
dir_name = os.path.abspath(os.path.join(os.path.abspath(input_fname),
'..'))
# remove empty lines:
while '\n' in lines:
lines.remove('\n')
# replace tabs with spaces and trim newline characters
for i, l in enumerate(lines):
lines[i] = l.replace('\n', '').split('\t')
# Convert to a numpy array, rather than just a list
data = np.array(lines)
# Find unique lines in first column (number of edges)
num_edges = len(set(data[:, 0]))
# Get first column, which is the index (number) of each edge
edgenums = data[:, 0]
# Determine where the end points of each edge are:
end_idx = np.zeros(1, dtype='int') # always one at zero
for i in np.where(np.diff(edgenums.astype('int')) == 1)[0]:
end_idx = np.append(end_idx, i) # find where each edge ends, and add
end_idx = np.append(end_idx, i + 1) # position and next start to list
end_idx = np.append(end_idx, len(edgenums) - 1) # add the last point, too
# end_points = data[end_idx] is now a list of edge end points
end_points = data[end_idx]
# join each row of the end points list into a single string
string_list = [' '.join(row) for row in end_points[:, 1:]]
# count how many times each end point shows up
edges, counts = np.unique(string_list, return_counts=True)
# put the counts before each unique edge, transpose it, then sort (and
# reverse) the list, so the output matches the previous findNodes.sh script
# noinspection PyUnusedLocal
results = sorted([' '.join(s) for
s in np.asarray((counts, edges)).T])[::-1]
# Create array of all the data, with counts at the beginning
data = np.c_[counts, np.array([i.split(' ') for i in edges])]
# sort by counts column
sort_idx = np.lexsort((data[:, 0],))[::-1]
data = data[sort_idx]
# Find number of nodes by taking the length of the data
num_nodes = len(data)
# Find average node connectivity
mean_k = data.astype('float32')[:, 0].mean()
if save_output:
np.savetxt(node_output,
data,
fmt='%s',
header="Python findNodes run at {0}\n\n"
"List of nodes in input skeleton \"{1}\"\n"
" from working directory \"{2}\"\n"
"\n"
"{3} total edges in skeleton\n"
"{4} total nodes in skeleton\n"
"\n"
"of\n"
"edges x\t\t\ty\t\t\t\tz\t\t\tThickness\n"
"----------------------------------------------"
"---------".format(dt.strftime(dt.now(),
"%A - %b %d, "
"%Y - %I:%M:%S %p"),
os.path.basename(input_fname),
dir_name,
num_edges,
num_nodes),
delimiter='\t',
comments='## ')
return data.astype(return_type), num_edges, num_nodes, mean_k
[docs]def bootstrap_skel_stats(mv3d_pattern,
csv_pattern=None,
n_bootstrap=100000,
volume=None,
save_output=False,
data_output_fname=None,
err_output_fname=None):
"""
Calculate errors for skeleton statistics, as output by the Avizo
subsampling script. Operates on many .mv3d skeleton files
Parameters
----------
mv3d_pattern: str
glob pattern to grab mv3d files to process from output of subvolume
Avizo scripts. Based off of these filenames, corresponding .csv
files containing the spatial graph statistics will be accessed as well.
If the mv3d files are named:
``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.mv3d``
The following pattern will be used to glob for the Spatial graph
stats: ``YMdA-01_labels.view.LSM.skel.am.subvolSkel.*.csv``
csv_pattern: :class:`str` or :data:`None`
glob pattern to grab csv files to process from Avizo spatial graph
output. If None, will attempt a calculation from the ``mv3d_pattern``
value as described above
n_bootstrap: int
number of bootstrap samples to use when calculating confidence
intervals
volume: :data:`None` or :ref:`number <numbers>`
volume of analyzed data cube. If this is given, data will be
returned with nodes/volume and edges/volume given, in addition to
the absolute values
save_output: bool
switch to control whether or not the output is written directly to a
CSV file in the current directory
data_output_fname: :data:`None` or :class:`str`
filename to use when saving the data output; if None, an appropriate
string will be built from the input ``pattern``
err_output_fname: :data:`None` or :class:`str`
filename to use when saving the error output; if None, an appropriate
string will be built from the input ``pattern``
Returns
-------
data_df: :class:`pandas.DataFrame`
Dataframe with data from subvolume statistic calculations
error_df: :class:`~pandas.DataFrame`
Dataframe with low and high errors calculated using n_bootstrap
samples
"""
mv3d_filelist = glob.glob(mv3d_pattern)
if len(mv3d_filelist) == 0:
raise ValueError("Did not find any .mv3d files!")
# Find index of second occurrence of a dot
if csv_pattern is None:
second_dot = mv3d_pattern.rindex('.', 0, mv3d_pattern.rindex('.') - 1)
csv_pattern = mv3d_pattern[:second_dot] + '.*csv'
csv_filelist = glob.glob(csv_pattern)
# Control switching of csv calculations
run_csv = False
if len(csv_filelist) > 0:
print(len(csv_filelist))
run_csv = True
if len(csv_filelist) != len(mv3d_filelist):
print("mv3d pattern: " + mv3d_pattern)
print("csv pattern: " + csv_pattern)
raise ValueError("mv3d and csv filelists were different lengths")
if save_output and data_output_fname is None:
data_output_fname = mv3d_pattern[
:-1 * mv3d_pattern[::-1].index('*') - 1] + \
'bootstrap_data.csv'
if save_output and err_output_fname is None:
err_output_fname = mv3d_pattern[
:-1 * mv3d_pattern[::-1].index('*') - 1] + \
'bootstrap_errors.csv'
# Blank results dataframe:
if volume:
# Include the volume normalized data, if volume given
data_df = pd.DataFrame(columns=['E', 'E/V', 'N', 'N/V', 'mean_k'])
else:
data_df = pd.DataFrame(columns=['E', 'N', 'mean_k'])
# Loop over all the mv3d files and add the results from find_nodes to
# the data frame
bar = tqdm(mv3d_filelist, desc='Running find_nodes')
for mv3d_file in bar:
data_df = data_df.append(
dict(list(zip(['E', 'N', 'mean_k'],
list(
find_nodes(mv3d_file, save_output=False)[1:])))),
ignore_index=True)
if volume:
data_df['N/V'] = data_df['N'] / volume
data_df['E/V'] = data_df['E'] / volume
bar.set_description('Running find_nodes on ' +
os.path.basename(mv3d_file))
if run_csv:
# Add columns of interest to the dataframe
data_df["perc_deg"] = ""
data_df["topo_length"] = ""
# Loop over all the mv3d files and add the results from find_nodes to
# the data frame
bar = tqdm(csv_filelist, desc='Analyzing CSV files')
for i, csv_file in enumerate(bar):
# Read current file into a dataframe
df = pd.read_csv(csv_file, skiprows=1)
# Slice the dataframe to only include the rows before the
# 'Total' row
df = df.ix[:(df['Graph ID'] == 'Total').idxmax() - 1]
# Set the index to the Graph ID column
df = df.set_index('Graph ID')
df.index.name = None
# Sort by total length (descending)
df = df.sort_values(by=['Total Length'], ascending=False)
# Get total sum of network length:
tot_length = df['Total Length'].sum()
# Create fraction of total length and cumulative length columns:
df['% total length'] = df['Total Length'] / tot_length
df['Cum Length'] = df['Total Length'].cumsum()
# Calculate percolation degree by finding the first graph that
# adds less than 1% to the total network length, when summed
# cumulatively. perc_degree is the fraction of total network
# length included at this point:
try:
perc_degree = df['Cum Length'][
df['% total length'] < 0.01][0] / tot_length
except IndexError as _:
perc_degree = 1
# Set percolation degree within data_df
data_df.set_value(i, 'perc_deg', perc_degree)
# Topological length is total network length divided by number
# of nodes; add to data_df
data_df.set_value(i, 'topo_length', tot_length / data_df['N'][i])
error_df = calculate_errors(data_df, n_bootstrap)
if save_output:
data_df.to_csv(path_or_buf=data_output_fname)
print(("Data output saved to {}".format(data_output_fname)))
error_df.to_csv(path_or_buf=err_output_fname)
print(("Error output saved to {}".format(err_output_fname)))
return data_df, error_df