# Source code for FIBbootstrap.tortuosity

# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import os

import pandas as pd

from .utils import calculate_errors

__all__ = ['bootstrap_tort_stats']

def _default_output_prefix(csv_pattern):
    """
    Return the prefix used to build the automatic output filenames.

    The prefix is everything in ``csv_pattern`` before its last ``*``. If the
    pattern contains no ``*``, the directory portion of the pattern (with a
    trailing separator, or an empty string if there is none) is used instead.
    """
    prefix, star, _ = csv_pattern.rpartition('*')
    if star:
        return prefix
    # Joining with '' appends a trailing separator only when the directory
    # part is non-empty, so a bare filename maps to the current directory
    return os.path.join(os.path.dirname(csv_pattern), '')


def _read_last_torts(csv_file, thresh):
    """
    Read one tortuosity profile and return its tail as a named column.

    Parameters
    ----------
    csv_file: str
        path of the profile to read; its first line holds the dataset name
    thresh: float
        only rows whose euclidean distance exceeds ``thresh`` times the
        last distance in the file are kept

    Returns
    -------
    name: str
        dataset name read from the first line, with ``' tort'`` appended
    last_torts: :class:`pandas.Series`
        the retained tortuosity values, re-indexed from zero
    """
    with open(csv_file, 'r') as f:
        # Read first line to get name for dataset; this also consumes it, so
        # pandas starts parsing at the second line of the file
        name = f.readline().strip('# ').strip(' \n')

        # dropna() removes rows that do not have a valid tortuosity value
        df = pd.read_csv(f,
                         escapechar='#',
                         # Spaces before 'nan' are necessary because of
                         # the way we saved the data in fibtortuosity
                         na_values='       nan').dropna(axis=0)

    df.columns = ['Euc_d', 'tort']  # Explicitly rename columns

    # The last row is taken as the largest distance (assumes the profile is
    # stored in order of increasing euclidean distance)
    max_d = df['Euc_d'].iloc[-1]
    last_torts = df.loc[df['Euc_d'] > thresh * max_d, 'tort']

    # Drop the original row labels so every profile starts at row 0 and the
    # columns line up when they are placed side by side
    return name + ' tort', last_torts.reset_index(drop=True)


def bootstrap_tort_stats(csv_pattern=None,
                         n_bootstrap=100000,
                         thresh=0.75,
                         save_output=False,
                         data_output_fname=None,
                         err_output_fname=None):
    """
    Calculate errors for tortuosity profiles, as output by
    :py:mod:`FIBTortuosity <fibtortuosity>`
    module. Operates on many .csv files, each with a single tortuosity
    profile for a phase and direction (i.e. LSM-x, Pore-y, YSZ-z, etc.)

    Parameters
    ----------
    csv_pattern: str
        glob pattern to grab csv files to process from output of tortuosity
        calculations. Usually, this will be something like:
        ``os.path.join(<path holding files>, "*.csv")``. Files are processed
        in sorted order, and files matching the output filenames of this
        function are skipped so that results of a previous run are not read
        back in as profiles.
    n_bootstrap: int
        number of bootstrap samples to use when calculating confidence
        intervals
    thresh: float
        value between 0 and 1, defining from what portion of the profiles to
        calculate the errors. For example, for the default value of 0.75,
        the error in the tortuosity for the last 25% of euclidean distance
        values will be calculated. A ``thresh`` value of 0.0 would calculate
        the error on the whole profile. Usually, only a small value towards
        the end of the dataset is desired, so one can analyze how much the
        data was changing towards the end of the profile.
    save_output: bool
        switch to control whether or not the bootstrap data and error output
        is written directly to CSV files
    data_output_fname: :data:`None` or :class:`str`
        filename to use when saving the data output; if None, the part of
        ``csv_pattern`` before its last ``*`` (or its directory, if it has no
        ``*``) followed by ``bootstrap_data.csv`` is used
    err_output_fname: :data:`None` or :class:`str`
        filename to use when saving the error output; if None, the part of
        ``csv_pattern`` before its last ``*`` (or its directory, if it has no
        ``*``) followed by ``bootstrap_errors.csv`` is used

    Returns
    -------
    data_df: :class:`pandas.DataFrame`
        Dataframe with one column per profile (named ``'<name> tort'``),
        holding the tortuosity values that were used
    error_df: :class:`~pandas.DataFrame`
        Dataframe with low and high errors calculated using n_bootstrap
        samples

    Raises
    ------
    TypeError
        if ``csv_pattern`` is None
    """
    if csv_pattern is None:
        raise TypeError("csv_pattern must be a glob pattern string, "
                        "not None")

    # Automagically determine file names for output (if necessary). This is
    # done even when nothing will be saved, so that output left behind by an
    # earlier run can be recognised and skipped below.
    prefix = _default_output_prefix(csv_pattern)
    if data_output_fname is None:
        data_output_fname = prefix + 'bootstrap_data.csv'
    if err_output_fname is None:
        err_output_fname = prefix + 'bootstrap_errors.csv'

    # The default outputs end in '.csv' and usually sit next to the inputs,
    # so the pattern would otherwise pick them up as tortuosity profiles
    output_paths = set()
    for out_name in (data_output_fname, err_output_fname):
        if isinstance(out_name, str):
            output_paths.add(os.path.abspath(out_name))

    # glob returns files in arbitrary order; sort for reproducible columns
    csv_filelist = sorted(
        file_ for file_ in glob.glob(csv_pattern)
        if os.path.abspath(file_) not in output_paths)

    # Collect one named column of tortuosity values per csv file
    names = []
    columns = []
    for file_ in csv_filelist:
        name, last_torts = _read_last_torts(file_, thresh)
        names.append(name)
        columns.append(last_torts)

    if columns:
        # Place all profiles side by side in a single step; shorter
        # profiles are padded with NaN
        total_df = pd.concat(columns, ignore_index=True, axis=1)
        # Set names of data frame to include which profile is which
        total_df.columns = names
    else:
        total_df = pd.DataFrame()

    # Calculate actual errors and save data
    data_df = total_df
    error_df = calculate_errors(total_df, n_bootstrap)

    if save_output:
        data_df.to_csv(path_or_buf=data_output_fname)
        print("Data output saved to {}".format(data_output_fname))
        error_df.to_csv(path_or_buf=err_output_fname)
        print("Error output saved to {}".format(err_output_fname))

    return data_df, error_df