# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
from .utils import calculate_errors
import pandas as pd
__all__ = ['bootstrap_tort_stats']
def _default_output_fname(csv_pattern, suffix):
    """Build a default output path by appending ``suffix`` to a prefix of
    ``csv_pattern``.

    The prefix is everything before the last ``*`` in the pattern. If the
    pattern contains no ``*`` the directory part of the pattern is used
    instead, so that a usable name is still produced.
    """
    import os

    prefix, star, _ = csv_pattern.rpartition('*')
    if not star:
        return os.path.join(os.path.dirname(csv_pattern), suffix)
    return prefix + suffix


def _read_tail_tortuosity(file_, thresh):
    """Read one profile file and return ``(column_name, tail_values)``.

    ``column_name`` is the first line of the file (stripped of ``#`` and
    whitespace) with ``' tort'`` appended; ``tail_values`` is a
    :class:`pandas.Series` (index reset to 0..n-1) of the tortuosity values
    whose euclidean distance is greater than ``thresh`` times the last
    distance in the profile.
    """
    with open(file_, 'r') as f:
        # The first line holds the name of the dataset; reading it here
        # also removes it from the file object before pandas parses the rest
        name = f.readline().strip('# ').strip(' \n')

        # dropna() removes rows that do not have a valid tortuosity value
        df = pd.read_csv(f,
                         skiprows=0,
                         escapechar='#',
                         # Spaces before 'nan' are necessary because of
                         # the way we saved the data in fibtortuosity
                         na_values=' nan').dropna(axis=0)

    # Explicitly rename columns (the file must hold exactly two)
    df.columns = ['Euc_d', 'tort']

    # Distance of the last row that still has a valid tortuosity value
    max_d = df['Euc_d'].iloc[-1]

    # Only keep tortuosity values in the last (1 - thresh) fraction of
    # euclidean distance values, and drop the original row index so the
    # profiles line up from row 0 when they are placed side by side
    tail = df.loc[df['Euc_d'] > thresh * max_d, 'tort']
    return name + ' tort', tail.reset_index(drop=True)


def bootstrap_tort_stats(csv_pattern=None,
                         n_bootstrap=100000,
                         thresh=0.75,
                         save_output=False,
                         data_output_fname=None,
                         err_output_fname=None):
    """
    Calculate errors for tortuosity profiles, as output by
    :py:mod:`FIBTortuosity <fibtortuosity>`
    module. Operates on many .csv files, each with a single tortuosity
    profile for a phase and direction (i.e. LSM-x, Pore-y, YSZ-z, etc.)

    Parameters
    ----------
    csv_pattern: str
        glob pattern to grab csv files to process from output of tortuosity
        calculations. Usually, this will be something like:
        ``os.path.join(<path holding files>, "*.csv")``.
        Matching files are processed in sorted order, so the column order
        of the result is reproducible.
    n_bootstrap: int
        number of bootstrap samples to use when calculating confidence
        intervals
    thresh: float
        value between 0 and 1, defining from what portion of the profiles to
        calculate the errors. For example, for the default value of 0.75,
        the error in the tortuosity for the last 25% of euclidean distance
        values will be calculated. A ``thresh`` value of 0.0 would calculate
        the error on the whole profile. Usually, only a small value towards
        the end of the dataset is desired, so one can analyze how much the
        data was changing towards the end of the profile.
    save_output: bool
        switch to control whether or not the bootstrap data and error output
        is written directly to CSV files. When True, files in the glob
        result that are themselves the output files are skipped, so that
        output left over from a previous run is not read back in as a
        profile.
    data_output_fname: :data:`None` or :class:`str`
        filename to use when saving the data output; if None, an appropriate
        string will be built from the input ``csv_pattern`` (the text before
        its last ``*`` followed by ``bootstrap_data.csv``)
    err_output_fname: :data:`None` or :class:`str`
        filename to use when saving the error output; if None, an
        appropriate string will be built from the input ``csv_pattern`` (the
        text before its last ``*`` followed by ``bootstrap_errors.csv``)

    Returns
    -------
    data_df: :class:`pandas.DataFrame`
        Dataframe with one column per input file (named
        ``"<profile name> tort"``), holding the tortuosity values selected
        by ``thresh``
    error_df: :class:`~pandas.DataFrame`
        Result of ``calculate_errors(data_df, n_bootstrap)``; low and high
        errors calculated using n_bootstrap samples

    Raises
    ------
    TypeError
        If ``csv_pattern`` is None
    """
    import os

    if csv_pattern is None:
        raise TypeError("csv_pattern must be a glob pattern string, "
                        "not None")

    # Automagically determine file names for output (if necessary)
    if save_output and data_output_fname is None:
        data_output_fname = _default_output_fname(csv_pattern,
                                                  'bootstrap_data.csv')
    if save_output and err_output_fname is None:
        err_output_fname = _default_output_fname(csv_pattern,
                                                 'bootstrap_errors.csv')

    # glob returns files in arbitrary order; sort for a stable column order
    csv_filelist = sorted(glob.glob(csv_pattern))

    if save_output:
        # The default output names live next to the inputs and end in
        # ".csv", so they can match the pattern themselves. Files that are
        # about to be overwritten are never valid input profiles.
        outputs = {os.path.abspath(data_output_fname),
                   os.path.abspath(err_output_fname)}
        csv_filelist = [f for f in csv_filelist
                        if os.path.abspath(f) not in outputs]

    # Collect column names and values for every profile
    names = []
    columns = []
    for file_ in csv_filelist:
        name, tail = _read_tail_tortuosity(file_, thresh)
        names.append(name)
        columns.append(tail)

    # Place all profiles side by side in a single concat (rather than
    # growing the frame once per file); ignore_index discards the repeated
    # 'tort' labels, and shorter profiles are padded with NaN
    if columns:
        total_df = pd.concat(columns, axis=1, ignore_index=True)
    else:
        total_df = pd.DataFrame()

    # Set names of data frame to include which profile is which
    total_df.columns = names

    # Calculate actual errors and save data
    data_df = total_df
    error_df = calculate_errors(total_df, n_bootstrap)

    if save_output:
        data_df.to_csv(path_or_buf=data_output_fname)
        print("Data output saved to {}".format(data_output_fname))
        error_df.to_csv(path_or_buf=err_output_fname)
        print("Error output saved to {}".format(err_output_fname))

    return data_df, error_df