Source code for nestcheck.pandas_functions

#!/usr/bin/env python
"""
Useful transformations and operations on pandas DataFrames.
"""

import copy
import warnings
import numpy as np
import pandas as pd


[docs]def summary_df_from_array(results_array, names, axis=0, **kwargs):
    """Make a panda data frame of the mean and std devs of an array of results,
    including the uncertainties on the values.

    This function converts the array to a DataFrame and calls summary_df on it.

    Parameters
    ----------
    results_array: 2d numpy array
    names: list of str
        Names for the output df's columns.
    axis: int, optional
        Axis on which to calculate summary statistics.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    assert axis == 0 or axis == 1
    df = pd.DataFrame(results_array)
    if axis == 1:
        df = df.T
    df.columns = names
    return summary_df(df, **kwargs)


[docs]def summary_df_from_list(results_list, names, **kwargs):
    """Make a panda data frame of the mean and std devs of each element of a
    list of 1d arrays, including the uncertainties on the values.

    This just converts the array to a DataFrame and calls summary_df on it.

    Parameters
    ----------
    results_list: list of 1d numpy arrays
        Must have same length as names.
    names: list of strs
        Names for the output df's columns.
    kwargs: dict, optional
        Keyword arguments to pass to summary_df.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    for arr in results_list:
        assert arr.shape == (len(names),)
    df = pd.DataFrame(np.stack(results_list, axis=0))
    df.columns = names
    return summary_df(df, **kwargs)


[docs]def summary_df_from_multi(multi_in, inds_to_keep=None, **kwargs):
    """Apply summary_df to a multiindex while preserving some levels.

    Parameters
    ----------
    multi_in: multiindex pandas DataFrame
    inds_to_keep: None or list of strs, optional
        Index levels to preserve.
    kwargs: dict, optional
        Keyword arguments to pass to summary_df.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    # Need to pop include true values and add separately at the end as
    # otherwise we get multiple true values added
    include_true_values = kwargs.pop('include_true_values', False)
    true_values = kwargs.get('true_values', None)
    if inds_to_keep is None:
        inds_to_keep = list(multi_in.index.names)[:-1]
    if 'calculation type' not in inds_to_keep:
        df = multi_in.groupby(inds_to_keep).apply(
            summary_df, include_true_values=False, **kwargs)
    else:
        # If there is already a level called 'calculation type' in multi,
        # summary_df will try making a second 'calculation type' index and (as
        # of pandas v0.23.0) throw an error. Avoid this by renaming.
        inds_to_keep = [lev if lev != 'calculation type' else
                        'calculation type temp' for lev in inds_to_keep]
        multi_temp = copy.deepcopy(multi_in)
        multi_temp.index.set_names(
            [lev if lev != 'calculation type' else 'calculation type temp' for
             lev in list(multi_temp.index.names)], inplace=True)
        df = multi_temp.groupby(inds_to_keep).apply(
            summary_df, include_true_values=False, **kwargs)
        # add the 'calculation type' values ('mean' and 'std') produced by
        # summary_df to the input calculation type names (now in level
        # 'calculation type temp')
        ind = (df.index.get_level_values('calculation type temp') + ' ' +
               df.index.get_level_values('calculation type'))
        order = list(df.index.names)
        order.remove('calculation type temp')
        df.index = df.index.droplevel(
            ['calculation type', 'calculation type temp'])
        df['calculation type'] = list(ind)
        df.set_index('calculation type', append=True, inplace=True)
        df = df.reorder_levels(order)
    if include_true_values:
        assert true_values is not None
        tv_ind = ['true values' if name == 'calculation type' else '' for
                  name in df.index.names[:-1]] + ['value']
        df.loc[tuple(tv_ind), :] = true_values
    return df


[docs]def summary_df(df_in, **kwargs):
    """Make a panda data frame of the mean and std devs of an array of results,
    including the uncertainties on the values.

    This is similar to pandas.DataFrame.describe but also includes estimates of
    the numerical uncertainties.

    The output DataFrame has multiindex levels:

    'calculation type':  mean and standard deviations of the data.
    'result type': value and uncertainty for each quantity.

    calculation type    result type        column_1     column_2      ...
    mean                value
    mean                uncertainty
    std                 value
    std                 uncertainty

    Parameters
    ----------
    df_in: pandas DataFrame
    true_values: array
        Analytical values if known for comparison with mean. Used to
        calculate root mean squared errors (RMSE).
    include_true_values: bool, optional
        Whether or not to include true values in the output DataFrame.
    include_rmse: bool, optional
        Whether or not to include root-mean-squared-errors in the output
        DataFrame.


    Returns
    -------
    df: MultiIndex DataFrame
    """
    true_values = kwargs.pop('true_values', None)
    include_true_values = kwargs.pop('include_true_values', False)
    include_rmse = kwargs.pop('include_rmse', False)
    if kwargs:
        raise TypeError('Unexpected **kwargs: {0}'.format(kwargs))
    if true_values is not None:
        assert true_values.shape[0] == df_in.shape[1], (
            'There should be one true value for every column! '
            'true_values.shape=' + str(true_values.shape) + ', '
            'df_in.shape=' + str(df_in.shape))
    # make the data frame
    df = pd.DataFrame([df_in.mean(axis=0), df_in.std(axis=0, ddof=1)],
                      index=['mean', 'std'])
    if include_true_values:
        assert true_values is not None
        df.loc['true values'] = true_values
    # Make index categorical to allow sorting
    df.index = pd.CategoricalIndex(df.index.values, ordered=True,
                                   categories=['true values', 'mean', 'std',
                                               'rmse'],
                                   name='calculation type')
    # add uncertainties
    num_cals = df_in.shape[0]
    mean_unc = df.loc['std'] / np.sqrt(num_cals)
    std_unc = df.loc['std'] * np.sqrt(1 / (2 * (num_cals - 1)))
    df['result type'] = pd.Categorical(['value'] * df.shape[0], ordered=True,
                                       categories=['value', 'uncertainty'])
    df.set_index(['result type'], drop=True, append=True, inplace=True)
    df.loc[('mean', 'uncertainty'), :] = mean_unc.values
    df.loc[('std', 'uncertainty'), :] = std_unc.values
    if include_rmse:
        assert true_values is not None, \
            'Need to input true values for RMSE!'
        rmse, rmse_unc = rmse_and_unc(df_in.values, true_values)
        df.loc[('rmse', 'value'), :] = rmse
        df.loc[('rmse', 'uncertainty'), :] = rmse_unc
    # Ensure correct row order by sorting
    df.sort_index(inplace=True)
    # Cast calculation type index back from categorical to string to allow
    # adding new calculation types
    df.set_index(
        [df.index.get_level_values('calculation type').astype(str),
         df.index.get_level_values('result type')],
        inplace=True)
    return df


[docs]def efficiency_gain_df(method_names, method_values, est_names, **kwargs):
    r"""Calculated data frame showing

    .. math::

        \mathrm{efficiency\,gain}
        =
        \frac{\mathrm{Var[base\,method]}}{\mathrm{Var[new\,method]}}

    See the dynamic nested sampling paper (Higson et al. 2019) for more
    details.

    The standard method on which to base the gain is assumed to be the first
    method input.

    The output DataFrame will contain rows:
        mean [dynamic goal]: mean calculation result for standard nested
            sampling and dynamic nested sampling with each input dynamic
            goal.
        std [dynamic goal]: standard deviation of results for standard
            nested sampling and dynamic nested sampling with each input
            dynamic goal.
        gain [dynamic goal]: the efficiency gain (computational speedup)
            from dynamic nested sampling compared to standard nested
            sampling. This equals (variance of standard results) /
            (variance of dynamic results); see the dynamic nested
            sampling paper for more details.

    Parameters
    ----------
    method names: list of strs
    method values: list
        Each element is a list of 1d arrays of results for the method. Each
        array must have shape (len(est_names),).
    est_names: list of strs
        Provide column titles for output df.
    true_values: iterable of same length as estimators list
        True values of the estimators for the given likelihood and prior.

    Returns
    -------
    results: pandas data frame
        Results data frame.
    """
    true_values = kwargs.pop('true_values', None)
    include_true_values = kwargs.pop('include_true_values', False)
    include_rmse = kwargs.pop('include_rmse', False)
    adjust_nsamp = kwargs.pop('adjust_nsamp', None)
    if kwargs:
        raise TypeError('Unexpected **kwargs: {0}'.format(kwargs))
    if adjust_nsamp is not None:
        assert adjust_nsamp.shape == (len(method_names),)
    assert len(method_names) == len(method_values)
    df_dict = {}
    for i, method_name in enumerate(method_names):
        # Set include_true_values=False as we don't want them repeated for
        # every method
        df = summary_df_from_list(
            method_values[i], est_names, true_values=true_values,
            include_true_values=False, include_rmse=include_rmse)
        if i != 0:
            stats = ['std']
            if include_rmse:
                stats.append('rmse')
            if adjust_nsamp is not None:
                # Efficiency gain measures performance per number of
                # samples (proportional to computational work). If the
                # number of samples is not the same we can adjust this.
                adjust = (adjust_nsamp[0] / adjust_nsamp[i])
            else:
                adjust = 1
            for stat in stats:
                # Calculate efficiency gain vs standard nested sampling
                gain, gain_unc = get_eff_gain(
                    df_dict[method_names[0]].loc[(stat, 'value')],
                    df_dict[method_names[0]].loc[(stat, 'uncertainty')],
                    df.loc[(stat, 'value')],
                    df.loc[(stat, 'uncertainty')], adjust=adjust)
                key = stat + ' efficiency gain'
                df.loc[(key, 'value'), :] = gain
                df.loc[(key, 'uncertainty'), :] = gain_unc
        df_dict[method_name] = df
    results = pd.concat(df_dict)
    results.index.rename('dynamic settings', level=0, inplace=True)
    new_ind = []
    new_ind.append(pd.CategoricalIndex(
        results.index.get_level_values('calculation type'), ordered=True,
        categories=['true values', 'mean', 'std', 'rmse',
                    'std efficiency gain', 'rmse efficiency gain']))
    new_ind.append(pd.CategoricalIndex(
        results.index.get_level_values('dynamic settings'),
        ordered=True, categories=[''] + method_names))
    new_ind.append(results.index.get_level_values('result type'))
    results.set_index(new_ind, inplace=True)
    if include_true_values:
        with warnings.catch_warnings():
            # Performance not an issue here so suppress annoying warning
            warnings.filterwarnings('ignore', message=(
                'indexing past lexsort depth may impact performance.'))
            results.loc[('true values', '', 'value'), :] = true_values
    results.sort_index(inplace=True)
    return results


[docs]def paper_format_efficiency_gain_df(eff_gain_df):
    """Transform efficiency gain data frames output by nestcheck into the
    format shown in the dynamic nested sampling paper (Higson et al. 2019).

    Parameters
    ----------
    eff_gain_df: pandas DataFrame
        DataFrame of the from produced by efficiency_gain_df.

    Returns
    -------
    paper_df: pandas DataFrame
    """
    idxs = pd.IndexSlice[['std', 'std efficiency gain'], :, :]
    paper_df = copy.deepcopy(eff_gain_df.loc[idxs, :])
    # Show mean number of samples and likelihood calls instead of st dev
    means = (eff_gain_df.xs('mean', level='calculation type')
             .xs('value', level='result type'))
    for col in ['samples', 'likelihood calls']:
        try:
            col_vals = []
            for val in means[col].values:
                col_vals += [int(np.rint(val)), np.nan]
            col_vals += [np.nan] * (paper_df.shape[0] - len(col_vals))
            paper_df[col] = col_vals
        except KeyError:
            pass
    row_name_map = {'std efficiency gain': 'Efficiency gain',
                    'St.Dev. efficiency gain': 'Efficiency gain',
                    'dynamic ': '',
                    'std': 'St.Dev.'}
    row_names = (paper_df.index.get_level_values(0).astype(str) + ' ' +
                 paper_df.index.get_level_values(1).astype(str))
    for key, value in row_name_map.items():
        row_names = row_names.str.replace(key, value)
    paper_df.index = [row_names, paper_df.index.get_level_values(2)]
    return paper_df


# Helper functions
# ----------------


[docs]def get_eff_gain(base_std, base_std_unc, meth_std, meth_std_unc, adjust=1):
    r"""Calculates efficiency gain for a new method compared to a base method.
    Given the variation in repeated calculations' results using the two
    methods, the efficiency gain is:

    .. math::

        \mathrm{efficiency\,gain}
        =
        \frac{\mathrm{Var[base\,method]}}{\mathrm{Var[new\,method]}}

    The uncertainty on the efficiency gain is also calculated.

    See the dynamic nested sampling paper (Higson et al. 2019) for more
    details.

    Parameters
    ----------
    base_std: 1d numpy array
    base_std_unc: 1d numpy array
        Uncertainties on base_std.
    meth_std: 1d numpy array
    meth_std_unc: 1d numpy array
        Uncertainties on base_std.

    Returns
    -------
    gain: 1d numpy array
    gain_unc: 1d numpy array
        Uncertainties on gain.
    """
    ratio = base_std / meth_std
    ratio_unc = array_ratio_std(
        base_std, base_std_unc, meth_std, meth_std_unc)
    gain = ratio ** 2
    gain_unc = 2 * ratio * ratio_unc
    gain *= adjust
    gain_unc *= adjust
    return gain, gain_unc


[docs]def rmse_and_unc(values_array, true_values):
    r"""Calculate the root meet squared error and its numerical uncertainty.

    With a reasonably large number of values in values_list the uncertainty
    on sq_errors should be approximately normal (from the central limit
    theorem).
    Uncertainties are calculated via error propagation: if :math:`\sigma`
    is the error on :math:`X` then the error on :math:`\sqrt{X}`
    is :math:`\frac{\sigma}{2 \sqrt{X}}`.

    Parameters
    ----------
    values_array: 2d numpy array
        Array of results: each row corresponds to a different estimate of the
        quantities considered.
    true_values: 1d numpy array
        Correct values for the quantities considered.

    Returns
    -------
    rmse: 1d numpy array
        Root-mean-squared-error for each quantity.
    rmse_unc: 1d numpy array
        Numerical uncertainties on each element of rmse.
    """
    assert true_values.shape == (values_array.shape[1],)
    errors = values_array - true_values[np.newaxis, :]
    sq_errors = errors ** 2
    sq_errors_mean = np.mean(sq_errors, axis=0)
    sq_errors_mean_unc = (np.std(sq_errors, axis=0, ddof=1) /
                          np.sqrt(sq_errors.shape[0]))
    rmse = np.sqrt(sq_errors_mean)
    rmse_unc = 0.5 * (1 / rmse) * sq_errors_mean_unc
    return rmse, rmse_unc


[docs]def array_ratio_std(values_n, sigmas_n, values_d, sigmas_d):
    r"""Gives error on the ratio of 2 floats or 2 1-dimensional arrays given
    their values and uncertainties. This assumes the covariance = 0, and that
    the input uncertainties are small compared to the corresponding input
    values. _n and _d denote the numerator and denominator respectively.

    Parameters
    ----------
    values_n: float or numpy array
        Numerator values.
    sigmas_n: float or numpy array
        :math:`1\sigma` uncertainties on values_n.
    values_d: float or numpy array
        Denominator values.
    sigmas_d: float or numpy array
        :math:`1\sigma` uncertainties on values_d.

    Returns
    -------
    std: float or numpy array
        :math:`1\sigma` uncertainty on values_n / values_d.
    """
    std = np.sqrt((sigmas_n / values_n) ** 2 + (sigmas_d / values_d) ** 2)
    std *= (values_n / values_d)
    return std