Source code for FIBbootstrap.utils

# Copyright 2016 Joshua Taillon
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import numpy as np
from scikits import bootstrap
from tqdm import tqdm
from scipy import mean as sci_mean
from math import sqrt

__all__ = ['calculate_errors']


def calculate_errors(df,
                     samples):
    """
    Calculate the "error bars" of each column in a Pandas dataframe

    For every column, a bootstrapped confidence interval of the mean is
    computed with ``bootstrap.ci(..., output='errorbar')`` on the non-NaN
    values, and is stored together with the column's mean, standard
    deviation and standard error of the mean (``std / sqrt(count)``).
    A tqdm progress bar reports which column is being processed.

    Parameters
    ----------
    df: ~pandas.DataFrame
        dataframe on which to calculate
    samples: int
        number of bootstrap samples to use (passed to ``bootstrap.ci`` as
        ``n_samples``)

    Returns
    -------
    result: ~pandas.DataFrame
        dataframe with the same columns as ``df`` and the rows
        ``'Neg. CI'``, ``'Pos. CI'``, ``'---'``, ``'Mean'``,
        ``'Std. Dev.'`` and ``'SEM'``. If ``df`` has no columns, the
        returned dataframe has no columns either.

    Notes
    -----
    The per-column values are assembled with ``np.append`` together with
    the ``'---'`` separator string, so they are presumably coerced to a
    common (string) dtype before being stored -- TODO confirm whether
    callers rely on that.
    """
    row_labels = ['Neg. CI', 'Pos. CI', '---', 'Mean', 'Std. Dev.', 'SEM']
    result = pd.DataFrame(index=row_labels, columns=df.columns)

    columns = list(df.columns)
    if not columns:
        # Nothing to bootstrap; avoid indexing into an empty column list.
        return result

    # ``format`` (rather than ``+``) so non-string column labels work too.
    desc = 'Bootstrapping confidence intervals ({})'
    bar = tqdm(columns, desc=desc.format(columns[0]))
    last = len(columns) - 1

    for i, col in enumerate(bar):
        series = df[col]

        # np.mean is what the ``scipy.mean`` alias pointed to; using it
        # directly avoids depending on an alias newer SciPy no longer has.
        x = bootstrap.ci(data=series.dropna(axis=0),
                         n_samples=samples,
                         statfunction=np.mean, output='errorbar')

        std = series.std()
        # count() ignores NaN, matching the values std() was computed on.
        sem = std / sqrt(series.count())

        # noinspection PyUnresolvedReferences
        result[col] = pd.Series(np.append(x.flatten(),
                                          ['---',
                                           series.mean(),
                                           std,
                                           sem]),
                                index=row_labels)

        # Show the column that will be processed next; on the final
        # iteration the description stays on the last column.
        bar.set_description(desc=desc.format(columns[min(i + 1, last)]))

    return result