#!/usr/bin/env python
"""
Useful transformations and operations on pandas DataFrames.
"""
import copy
import warnings
import numpy as np
import pandas as pd
def summary_df_from_array(results_array, names, axis=0, **kwargs):
    """Summarise a 2d array of results via summary_df.

    The array is wrapped in a DataFrame (transposed first when axis=1),
    its columns are labelled with names and the frame is handed to
    summary_df, which computes the means and standard deviations of each
    column together with their uncertainties.

    Parameters
    ----------
    results_array: 2d numpy array
    names: list of str
        Names for the output df's columns.
    axis: int, optional
        Axis on which to calculate summary statistics. Must be 0 or 1.
    kwargs: dict, optional
        Keyword arguments to pass to summary_df.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    assert axis in (0, 1)
    frame = pd.DataFrame(results_array)
    if axis == 1:
        # summary_df always works down the rows, so flip the array when the
        # statistics are wanted along the other axis.
        frame = frame.T
    frame.columns = names
    return summary_df(frame, **kwargs)
def summary_df_from_list(results_list, names, **kwargs):
    """Summarise a list of 1d result arrays via summary_df.

    The arrays are stacked into the rows of a DataFrame whose columns are
    labelled with names; summary_df is then called on that frame.

    Parameters
    ----------
    results_list: list of 1d numpy arrays
        Each array must have shape (len(names),).
    names: list of strs
        Names for the output df's columns.
    kwargs: dict, optional
        Keyword arguments to pass to summary_df.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    expected_shape = (len(names),)
    for result in results_list:
        assert result.shape == expected_shape
    # One row per repeated calculation, one column per named quantity.
    stacked = np.stack(results_list, axis=0)
    frame = pd.DataFrame(stacked)
    frame.columns = names
    return summary_df(frame, **kwargs)
def summary_df_from_multi(multi_in, inds_to_keep=None, **kwargs):
    """Apply summary_df to a multiindex while preserving some levels.

    The input is grouped by the levels in inds_to_keep and summary_df is
    applied to every group.

    Parameters
    ----------
    multi_in: multiindex pandas DataFrame
    inds_to_keep: None or list of strs, optional
        Index levels to preserve. Defaults to every level of multi_in's
        index except the last one.
    kwargs: dict, optional
        Keyword arguments to pass to summary_df.

    Returns
    -------
    df: MultiIndex DataFrame
        See summary_df docstring for more details.
    """
    # include_true_values is removed from kwargs and handled once at the end;
    # letting summary_df add the true values would repeat them in every group.
    include_true_values = kwargs.pop('include_true_values', False)
    true_values = kwargs.get('true_values', None)
    if inds_to_keep is None:
        inds_to_keep = list(multi_in.index.names)[:-1]
    if 'calculation type' in inds_to_keep:
        # summary_df creates its own 'calculation type' level, and a second
        # level with the same name makes pandas (as of v0.23.0) throw an
        # error. Work on a copy whose existing level has a temporary name.
        temp_name = 'calculation type temp'
        inds_to_keep = [temp_name if lev == 'calculation type' else lev
                        for lev in inds_to_keep]
        multi_temp = copy.deepcopy(multi_in)
        renamed_levels = [temp_name if lev == 'calculation type' else lev
                          for lev in list(multi_temp.index.names)]
        multi_temp.index.set_names(renamed_levels, inplace=True)
        df = multi_temp.groupby(inds_to_keep).apply(
            summary_df, include_true_values=False, **kwargs)
        # Merge the input calculation type names (held in the temporary
        # level) with the 'mean' / 'std' labels produced by summary_df.
        ind = (df.index.get_level_values(temp_name) + ' ' +
               df.index.get_level_values('calculation type'))
        order = list(df.index.names)
        order.remove(temp_name)
        df.index = df.index.droplevel(['calculation type', temp_name])
        df['calculation type'] = list(ind)
        df.set_index('calculation type', append=True, inplace=True)
        df = df.reorder_levels(order)
    else:
        df = multi_in.groupby(inds_to_keep).apply(
            summary_df, include_true_values=False, **kwargs)
    if include_true_values:
        assert true_values is not None
        # Key for the single true values row: blank in every level except
        # 'calculation type', and 'value' in the final level.
        tv_ind = ['true values' if name == 'calculation type' else ''
                  for name in df.index.names[:-1]]
        tv_ind.append('value')
        df.loc[tuple(tv_ind), :] = true_values
    return df
def summary_df(df_in, **kwargs):
    """Make a pandas DataFrame of the mean and std devs of each column of
    df_in, including the uncertainties on the values.

    This is similar to pandas.DataFrame.describe but also includes estimates
    of the numerical uncertainties.

    The output DataFrame has multiindex levels:

    'calculation type': mean and standard deviations of the data (plus
        'true values' and 'rmse' rows when requested).
    'result type': value and uncertainty for each quantity.

    calculation type  result type  column_1  column_2 ...
    mean              value
    mean              uncertainty
    std               value
    std               uncertainty

    Parameters
    ----------
    df_in: pandas DataFrame
        One row per repeated calculation, one column per quantity.
    true_values: array-like, optional
        Analytical values if known for comparison with mean. Used to
        calculate root mean squared errors (RMSE). Must contain one value
        per column of df_in. Lists and tuples are converted to numpy
        arrays; objects which already have a shape attribute are used as
        they are.
    include_true_values: bool, optional
        Whether or not to include true values in the output DataFrame.
    include_rmse: bool, optional
        Whether or not to include root-mean-squared-errors in the output
        DataFrame.

    Returns
    -------
    df: MultiIndex DataFrame

    Raises
    ------
    TypeError
        If an unexpected keyword argument is supplied.
    AssertionError
        If the number of true values does not match the number of columns,
        or if true values are required but were not supplied.
    """
    true_values = kwargs.pop('true_values', None)
    include_true_values = kwargs.pop('include_true_values', False)
    include_rmse = kwargs.pop('include_rmse', False)
    if kwargs:
        raise TypeError('Unexpected **kwargs: {0}'.format(kwargs))
    if true_values is not None:
        if not hasattr(true_values, 'shape'):
            # Plain sequences have no shape attribute; convert them so the
            # length check below (and the RMSE calculation) can use them.
            true_values = np.asarray(true_values)
        assert true_values.shape[0] == df_in.shape[1], (
            'There should be one true value for every column! '
            'true_values.shape=' + str(true_values.shape) + ', '
            'df_in.shape=' + str(df_in.shape))
    # make the data frame
    df = pd.DataFrame([df_in.mean(axis=0), df_in.std(axis=0, ddof=1)],
                      index=['mean', 'std'])
    if include_true_values:
        assert true_values is not None
        df.loc['true values'] = true_values
    # Make index categorical to allow sorting
    df.index = pd.CategoricalIndex(df.index.values, ordered=True,
                                   categories=['true values', 'mean', 'std',
                                               'rmse'],
                                   name='calculation type')
    # add uncertainties: std / sqrt(n) for the mean and
    # std * sqrt(1 / (2 * (n - 1))) for the standard deviation
    num_cals = df_in.shape[0]
    mean_unc = df.loc['std'] / np.sqrt(num_cals)
    std_unc = df.loc['std'] * np.sqrt(1 / (2 * (num_cals - 1)))
    df['result type'] = pd.Categorical(['value'] * df.shape[0], ordered=True,
                                       categories=['value', 'uncertainty'])
    df.set_index(['result type'], drop=True, append=True, inplace=True)
    df.loc[('mean', 'uncertainty'), :] = mean_unc.values
    df.loc[('std', 'uncertainty'), :] = std_unc.values
    if include_rmse:
        assert true_values is not None, \
            'Need to input true values for RMSE!'
        rmse, rmse_unc = rmse_and_unc(df_in.values, true_values)
        df.loc[('rmse', 'value'), :] = rmse
        df.loc[('rmse', 'uncertainty'), :] = rmse_unc
    # Ensure correct row order by sorting
    df.sort_index(inplace=True)
    # Cast calculation type index back from categorical to string to allow
    # adding new calculation types
    df.set_index(
        [df.index.get_level_values('calculation type').astype(str),
         df.index.get_level_values('result type')],
        inplace=True)
    return df
def efficiency_gain_df(method_names, method_values, est_names, **kwargs):
    r"""Calculate a data frame showing

    .. math::

        \mathrm{efficiency\,gain}
        =
        \frac{\mathrm{Var[base\,method]}}{\mathrm{Var[new\,method]}}

    See the dynamic nested sampling paper (Higson et al. 2019) for more
    details.

    The standard method on which to base the gain is assumed to be the first
    method input.

    The output DataFrame is indexed by calculation type, method name
    (index level 'dynamic settings') and result type (value or
    uncertainty). For each method it contains rows:

    mean: mean calculation result.
    std: standard deviation of results.
    rmse: root-mean-squared-error of results (only if include_rmse is
        True).
    std efficiency gain: (std of the first method / std of this method)
        squared, multiplied by the sample number adjustment if one is
        given. Calculated for every method except the first.
    rmse efficiency gain: as for std efficiency gain but using the rmse
        values (only if include_rmse is True).

    Parameters
    ----------
    method_names: list of strs
        Other sequences of names (e.g. tuples) are also accepted.
    method_values: list
        Each element is a list of 1d arrays of results for the method. Each
        array must have shape (len(est_names),).
    est_names: list of strs
        Provide column titles for output df.
    true_values: iterable of same length as estimators list, optional
        True values of the estimators for the given likelihood and prior.
    include_true_values: bool, optional
        Whether or not to add a single 'true values' row to the output.
        Requires true_values.
    include_rmse: bool, optional
        Whether or not to include root-mean-squared-errors, and the
        efficiency gains calculated from them, in the output.
    adjust_nsamp: 1d array-like, optional
        One entry per method. Gains for method i are multiplied by
        adjust_nsamp[0] / adjust_nsamp[i] to account for the methods using
        different numbers of samples.

    Returns
    -------
    results: pandas data frame
        Results data frame.

    Raises
    ------
    TypeError
        If an unexpected keyword argument is supplied.
    AssertionError
        If the input lengths are inconsistent, or if include_true_values is
        True but no true_values were supplied.
    """
    true_values = kwargs.pop('true_values', None)
    include_true_values = kwargs.pop('include_true_values', False)
    include_rmse = kwargs.pop('include_rmse', False)
    adjust_nsamp = kwargs.pop('adjust_nsamp', None)
    if kwargs:
        raise TypeError('Unexpected **kwargs: {0}'.format(kwargs))
    # Use a list so the names can be concatenated into the category list
    # below whatever sequence type the caller supplied.
    method_names = list(method_names)
    if adjust_nsamp is not None:
        adjust_nsamp = np.asarray(adjust_nsamp)
        assert adjust_nsamp.shape == (len(method_names),)
    assert len(method_names) == len(method_values)
    if include_true_values:
        # Without this check a row of NaNs would be added silently.
        assert true_values is not None, \
            'Need to input true values to include them in the output!'
    df_dict = {}
    for i, method_name in enumerate(method_names):
        # Set include_true_values=False as we don't want them repeated for
        # every method
        df = summary_df_from_list(
            method_values[i], est_names, true_values=true_values,
            include_true_values=False, include_rmse=include_rmse)
        if i != 0:
            stats = ['std']
            if include_rmse:
                stats.append('rmse')
            if adjust_nsamp is not None:
                # Efficiency gain measures performance per number of
                # samples (proportional to computational work). If the
                # number of samples is not the same we can adjust this.
                adjust = (adjust_nsamp[0] / adjust_nsamp[i])
            else:
                adjust = 1
            for stat in stats:
                # Calculate efficiency gain vs the first (base) method
                gain, gain_unc = get_eff_gain(
                    df_dict[method_names[0]].loc[(stat, 'value')],
                    df_dict[method_names[0]].loc[(stat, 'uncertainty')],
                    df.loc[(stat, 'value')],
                    df.loc[(stat, 'uncertainty')], adjust=adjust)
                key = stat + ' efficiency gain'
                df.loc[(key, 'value'), :] = gain
                df.loc[(key, 'uncertainty'), :] = gain_unc
        df_dict[method_name] = df
    results = pd.concat(df_dict)
    results.index.rename('dynamic settings', level=0, inplace=True)
    # Rebuild the index with ordered categorical levels so that sorting
    # gives the intended row order rather than alphabetical order.
    new_ind = []
    new_ind.append(pd.CategoricalIndex(
        results.index.get_level_values('calculation type'), ordered=True,
        categories=['true values', 'mean', 'std', 'rmse',
                    'std efficiency gain', 'rmse efficiency gain']))
    new_ind.append(pd.CategoricalIndex(
        results.index.get_level_values('dynamic settings'),
        ordered=True, categories=[''] + method_names))
    new_ind.append(results.index.get_level_values('result type'))
    results.set_index(new_ind, inplace=True)
    if include_true_values:
        with warnings.catch_warnings():
            # Performance not an issue here so suppress annoying warning
            warnings.filterwarnings('ignore', message=(
                'indexing past lexsort depth may impact performance.'))
            results.loc[('true values', '', 'value'), :] = true_values
    results.sort_index(inplace=True)
    return results
# Helper functions
# ----------------
def get_eff_gain(base_std, base_std_unc, meth_std, meth_std_unc, adjust=1):
    r"""Calculates efficiency gain for a new method compared to a base method.

    Given the variation in repeated calculations' results using the two
    methods, the efficiency gain is:

    .. math::

        \mathrm{efficiency\,gain}
        =
        \frac{\mathrm{Var[base\,method]}}{\mathrm{Var[new\,method]}}

    The uncertainty on the efficiency gain is also calculated.

    See the dynamic nested sampling paper (Higson et al. 2019) for more
    details.

    Parameters
    ----------
    base_std: 1d numpy array
        Standard deviations of the base method's results.
    base_std_unc: 1d numpy array
        Uncertainties on base_std.
    meth_std: 1d numpy array
        Standard deviations of the new method's results.
    meth_std_unc: 1d numpy array
        Uncertainties on meth_std.
    adjust: float, optional
        Factor by which both the gain and its uncertainty are multiplied.

    Returns
    -------
    gain: 1d numpy array
    gain_unc: 1d numpy array
        Uncertainties on gain.
    """
    ratio = base_std / meth_std
    ratio_unc = array_ratio_std(
        base_std, base_std_unc, meth_std, meth_std_unc)
    gain = ratio ** 2
    # Error propagation for a square: d(ratio ** 2) = 2 * ratio * d(ratio).
    # The uncertainty is a magnitude, so keep it non-negative.
    gain_unc = np.abs(2 * ratio * ratio_unc)
    gain *= adjust
    gain_unc *= adjust
    return gain, gain_unc
def rmse_and_unc(values_array, true_values):
    r"""Calculate the root mean squared error and its numerical uncertainty.

    With a reasonably large number of values in values_array the uncertainty
    on sq_errors should be approximately normal (from the central limit
    theorem).

    Uncertainties are calculated via error propagation: if :math:`\sigma`
    is the error on :math:`X` then the error on :math:`\sqrt{X}`
    is :math:`\frac{\sigma}{2 \sqrt{X}}`.

    Parameters
    ----------
    values_array: 2d numpy array (or nested sequence)
        Array of results: each row corresponds to a different estimate of the
        quantities considered.
    true_values: 1d numpy array (or sequence)
        Correct values for the quantities considered. Must have one entry
        per column of values_array.

    Returns
    -------
    rmse: 1d numpy array
        Root-mean-squared-error for each quantity.
    rmse_unc: 1d numpy array
        Numerical uncertainties on each element of rmse.
    """
    # Accept lists/tuples as well as arrays; a no-op for numpy array inputs.
    values_array = np.asarray(values_array)
    true_values = np.asarray(true_values)
    assert true_values.shape == (values_array.shape[1],)
    errors = values_array - true_values[np.newaxis, :]
    sq_errors = errors ** 2
    sq_errors_mean = np.mean(sq_errors, axis=0)
    # Standard error on the mean of the squared errors
    sq_errors_mean_unc = (np.std(sq_errors, axis=0, ddof=1) /
                          np.sqrt(sq_errors.shape[0]))
    rmse = np.sqrt(sq_errors_mean)
    # Propagate the error on the mean squared error through the square root
    rmse_unc = 0.5 * (1 / rmse) * sq_errors_mean_unc
    return rmse, rmse_unc
def array_ratio_std(values_n, sigmas_n, values_d, sigmas_d):
    r"""Gives error on the ratio of 2 floats or 2 1-dimensional arrays given
    their values and uncertainties. This assumes the covariance = 0, and that
    the input uncertainties are small compared to the corresponding input
    values. _n and _d denote the numerator and denominator respectively.

    The standard error propagation formula

    .. math::

        \left| \frac{n}{d} \right|
        \sqrt{\left(\frac{\sigma_n}{n}\right)^2
              + \left(\frac{\sigma_d}{d}\right)^2}

    is evaluated in the algebraically equivalent form

    .. math::

        \sqrt{\left(\frac{\sigma_n}{d}\right)^2
              + \left(\frac{n}{d} \frac{\sigma_d}{d}\right)^2}

    which never divides by the numerator (so a numerator of exactly zero
    gives a finite answer) and is non-negative whatever the sign of the
    ratio.

    Parameters
    ----------
    values_n: float or numpy array
        Numerator values.
    sigmas_n: float or numpy array
        :math:`1\sigma` uncertainties on values_n.
    values_d: float or numpy array
        Denominator values.
    sigmas_d: float or numpy array
        :math:`1\sigma` uncertainties on values_d.

    Returns
    -------
    std: float or numpy array
        :math:`1\sigma` uncertainty on values_n / values_d.
    """
    numerator_term = (sigmas_n / values_d) ** 2
    denominator_term = ((values_n / values_d) * (sigmas_d / values_d)) ** 2
    return np.sqrt(numerator_term + denominator_term)