diff --git a/README.rst b/README.rst
index 53e3bbee..4ecb81c2 100644
--- a/README.rst
+++ b/README.rst
@@ -2,7 +2,7 @@ anesthetic: nested sampling post-processing
 ===========================================
 :Authors: Will Handley and Lukas Hergt
-:Version: 2.8.12
+:Version: 2.9.0
 :Homepage: https://github.com/handley-lab/anesthetic
 :Documentation: http://anesthetic.readthedocs.io/
 
diff --git a/anesthetic/_version.py b/anesthetic/_version.py
index 2819820c..387cfacc 100644
--- a/anesthetic/_version.py
+++ b/anesthetic/_version.py
@@ -1 +1 @@
-__version__ = '2.8.12'
+__version__ = '2.9.0'
diff --git a/anesthetic/utils.py b/anesthetic/utils.py
index efe1c80c..aa8345d4 100644
--- a/anesthetic/utils.py
+++ b/anesthetic/utils.py
@@ -3,6 +3,7 @@ import pandas
 
 from scipy import special
 from scipy.interpolate import interp1d
+from scipy.optimize import minimize_scalar
 from scipy.stats import kstwobign, entropy
 from matplotlib.tri import Triangulation
 import contextlib
@@ -151,6 +152,130 @@ def quantile(a, q, w=None, interpolation='linear'):
     return quant
 
 
+def sample_cdf(samples, inverse=False, interpolation='linear'):
+    """Draw a sample of the empirical CDF of a 1D array."""
+    samples = np.sort(samples)
+    ngaps = len(samples)-1
+    gaps = np.random.dirichlet(np.ones(ngaps))
+    cdf = np.array([0, *np.cumsum(gaps)])
+    # The last element should be one (up to floating point errors) because
+    # a Dirichlet sample sums to one. Set it exactly to avoid interpolation
+    # errors.
+    cdf[-1] = 1
+    if inverse:
+        return interp1d(cdf, samples, kind=interpolation)
+    else:
+        return interp1d(samples, cdf, kind=interpolation,
+                        fill_value=(0, 1), bounds_error=False)
+
+
+def credibility_interval(samples, weights=None, level=0.68, method="iso-pdf",
+                         return_covariance=False, nsamples=12):
+    """Compute the credibility interval of weighted samples.
+
+    Based on linear interpolation of the cumulative distribution function,
+    so expect discretisation errors on the scale of distances between
+    samples.
+
+    https://github.com/Stefan-Heimersheim/fastCI#readme
+
+    Parameters
+    ----------
+    samples : array
+        Samples to compute the credibility interval of.
+    weights : array, default=np.ones_like(samples)
+        Weights corresponding to samples.
+    level : float, default=0.68
+        Credibility level (probability, <1).
+    method : str, default='iso-pdf'
+        Which definition of interval to use:
+
+        * ``'iso-pdf'``: Calculate iso probability density interval with the
+          same probability density at each end. Also known as the waterline
+          interval or highest average posterior density interval. This is
+          only accurate if the distribution is sufficiently unimodal.
+        * ``'lower-limit'``/``'upper-limit'``: Lower/upper limit. One-sided
+          limits for which ``level`` fraction of the (equally weighted)
+          samples lie above/below the limit.
+        * ``'equal-tailed'``: Equal-tailed interval with the same fraction of
+          (equally weighted) samples below and above the interval region.
+
+    return_covariance : bool, default=False
+        Return the covariance of the sampled limits, in addition to the mean.
+    nsamples : int, default=12
+        Number of CDF samples used to improve the `mean` and `std` estimates.
+
+    Returns
+    -------
+    limit(s) : float, array, or tuple of floats or arrays
+        Returns the credibility interval boundary or boundaries. By default,
+        returns the mean over ``nsamples`` samples, which is either
+        two numbers (``method='iso-pdf'``/``'equal-tailed'``) or one number
+        (``method='lower-limit'``/``'upper-limit'``). If
+        ``return_covariance=True``, returns a tuple (mean(s), covariance)
+        where covariance is the covariance over the sampled limits.
+    """
+    if level >= 1:
+        raise ValueError('level must be <1, got {0:.2f}'.format(level))
+    if len(np.shape(samples)) != 1:
+        raise ValueError('Only 1D arrays are supported for samples')
+    if weights is not None and np.shape(samples) != np.shape(weights):
+        raise ValueError('Shape of samples and weights differs')
+
+    # Convert to numpy to unify indexing
+    samples = np.array(samples.copy())
+    if weights is None:
+        weights = np.ones(len(samples))
+    else:
+        weights = np.array(weights.copy())
+
+    # Convert weights to unit weights if that is not already the case
+    if not np.all(np.logical_or(weights == 0, weights == 1)):
+        # compress_weights with ncompress='equal' ensures weights are in
+        # {0, 1}. Note that this must be done, since we cannot handle
+        # weights != 1, see this discussion for details:
+        # https://github.com/williamjameshandley/anesthetic/pull/188#issuecomment-1274980982
+        weights = compress_weights(weights, ncompress='equal')
+
+    indices = np.where(weights)[0]
+    x = samples[indices]
+
+    # Sample the credibility interval multiple times
+    # to get errorbars on the credibility interval boundaries
+    ci_samples = []
+    for i in range(nsamples):
+        invCDF = sample_cdf(x, inverse=True)
+        if method == 'iso-pdf':
+            # Find the smallest interval containing `level` probability mass
+            def distance(Y, level=level):
+                return invCDF(Y+level)-invCDF(Y)
+            res = minimize_scalar(distance, bounds=(0, 1-level),
+                                  method="Bounded")
+            ci_samples.append(np.array([invCDF(res.x),
+                                        invCDF(res.x+level)]))
+        elif method == 'lower-limit':
+            # Get the value above which a fraction `level` of samples lie
+            ci_samples.append(invCDF(1-level))
+        elif method == 'upper-limit':
+            # Get the value below which a fraction `level` of samples lie
+            ci_samples.append(invCDF(level))
+        elif method == 'equal-tailed':
+            ci_samples.append(np.array([invCDF((1-level)/2),
+                                        invCDF((1+level)/2)]))
+        else:
+            raise ValueError("Method '{0:}' unknown".format(method))
+    ci_samples = np.array(ci_samples)
+    if np.shape(ci_samples) == (nsamples, ):
+        if return_covariance:
+            return np.mean(ci_samples), np.cov(ci_samples)
+        else:
+            return np.mean(ci_samples)
+    else:
+        if return_covariance:
+            return np.mean(ci_samples, axis=0), \
+                np.cov(ci_samples, rowvar=False)
+        else:
+            return np.mean(ci_samples, axis=0)
+
+
 def mirror_1d(d, xmin=None, xmax=None):
     """If necessary apply reflecting boundary conditions."""
     if xmin is not None and xmax is not None:
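For review context, a minimal sketch of how the new `credibility_interval` utility might be called directly; the Gaussian samples and weights here are illustrative placeholders, not part of this diff:

```python
import numpy as np
from anesthetic.utils import credibility_interval

np.random.seed(0)
# Illustrative weighted samples (hypothetical data, not from the test suite)
samples = np.random.normal(loc=0.0, scale=1.0, size=10000)
weights = np.random.rand(10000)

# Two-sided 68% iso-pdf interval (the default method)
lower, upper = credibility_interval(samples, weights=weights, level=0.68)

# One-sided 95% upper limit, also returning the covariance of the limit
# across the `nsamples` sampled CDF realisations
limit, cov = credibility_interval(samples, weights=weights, level=0.95,
                                  method='upper-limit',
                                  return_covariance=True)
```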
diff --git a/anesthetic/weighted_pandas.py b/anesthetic/weighted_pandas.py
index 78f6b9d7..0a4b2600 100644
--- a/anesthetic/weighted_pandas.py
+++ b/anesthetic/weighted_pandas.py
@@ -3,16 +3,17 @@
 import warnings
 from inspect import signature
 import numpy as np
+from numpy.ma import masked_array
 from pandas import Series, DataFrame, concat, MultiIndex
 from pandas.core.groupby import GroupBy, SeriesGroupBy, DataFrameGroupBy, ops
 from pandas._libs import lib
 from pandas._libs.lib import no_default
 from pandas.util._exceptions import find_stack_level
 from pandas.util import hash_pandas_object
-from numpy.ma import masked_array
-from anesthetic.utils import (compress_weights, neff, quantile,
-                              temporary_seed, adjust_docstrings)
 from pandas.core.dtypes.missing import notna
+from anesthetic.utils import (compress_weights, neff, quantile,
+                              temporary_seed, adjust_docstrings,
+                              credibility_interval)
 from pandas.core.accessor import CachedAccessor
 from anesthetic.plotting import PlotAccessor
 import pandas as pd
@@ -373,6 +374,55 @@ def compress(self, ncompress=True):
     def sample(self, *args, **kwargs):  # noqa: D102
         return super().sample(weights=self.get_weights(), *args, **kwargs)
 
+    def credibility_interval(self, level=0.68, method="iso-pdf",
+                             return_covariance=False, nsamples=12):
+        """Compute the credibility interval of the weighted samples.
+
+        Based on linear interpolation of the cumulative distribution
+        function, so expect discretisation errors on the scale of distances
+        between samples.
+
+        https://github.com/Stefan-Heimersheim/fastCI#readme
+
+        Parameters
+        ----------
+        level : float, default=0.68
+            Credibility level (probability, <1).
+        method : str, default='iso-pdf'
+            Which definition of interval to use:
+
+            * ``'iso-pdf'``: Calculate iso probability density interval
+              with the same probability density at each end. Also known as
+              the waterline interval or highest average posterior density
+              interval. This is only accurate if the distribution is
+              sufficiently unimodal.
+            * ``'lower-limit'``/``'upper-limit'``: Lower/upper limit.
+              One-sided limits for which ``level`` fraction of the (equally
+              weighted) samples lie above/below the limit.
+            * ``'equal-tailed'``: Equal-tailed interval with the same
+              fraction of (equally weighted) samples below and above the
+              interval region.
+
+        return_covariance : bool, default=False
+            Return the covariance of the sampled limits, in addition to the
+            mean.
+        nsamples : int, default=12
+            Number of CDF samples used to improve the `mean` and `std`
+            estimates.
+
+        Returns
+        -------
+        limit(s) : float, array, or tuple of floats or arrays
+            Returns the credibility interval boundaries of the Series.
+            By default, returns the mean over ``nsamples`` samples, which is
+            either two numbers (``method='iso-pdf'``/``'equal-tailed'``) or
+            one number (``method='lower-limit'``/``'upper-limit'``). If
+            ``return_covariance=True``, returns a tuple (mean(s), covariance)
+            where covariance is the covariance over the sampled limits.
+        """
+        return credibility_interval(self, weights=self.get_weights(),
+                                    level=level, method=method,
+                                    return_covariance=return_covariance,
+                                    nsamples=nsamples)
+
     @property
     def _constructor(self):
         return WeightedSeries
@@ -628,6 +678,78 @@ def sample(self, *args, **kwargs):  # noqa: D102
         else:
             return super().sample(*args, **kwargs)
 
+    def credibility_interval(self, level=0.68, method="iso-pdf",
+                             return_covariance=False, nsamples=12):
+        """Compute the credibility interval of the weighted samples.
+
+        Based on linear interpolation of the cumulative distribution
+        function, so expect discretisation errors on the scale of distances
+        between samples.
+
+        https://github.com/Stefan-Heimersheim/fastCI#readme
+
+        Parameters
+        ----------
+        level : float, default=0.68
+            Credibility level (probability, <1).
+        method : str, default='iso-pdf'
+            Which definition of interval to use:
+
+            * ``'iso-pdf'``: Calculate iso probability density interval
+              with the same probability density at each end. Also known as
+              the waterline interval or highest average posterior density
+              interval. This is only accurate if the distribution is
+              sufficiently unimodal.
+            * ``'lower-limit'``/``'upper-limit'``: Lower/upper limit.
+              One-sided limits for which ``level`` fraction of the (equally
+              weighted) samples lie above/below the limit.
+            * ``'equal-tailed'``: Equal-tailed interval with the same
+              fraction of (equally weighted) samples below and above the
+              interval region.
+
+        return_covariance : bool, default=False
+            Return the covariance of the sampled limits, in addition to the
+            mean.
+        nsamples : int, default=12
+            Number of CDF samples used to improve the `mean` and `std`
+            estimates.
+
+        Returns
+        -------
+        limit(s) : float, array, or tuple of floats or arrays
+            Returns the credibility interval boundaries for each column.
+            By default, returns the mean over ``nsamples`` samples, which is
+            either two numbers (``method='iso-pdf'``/``'equal-tailed'``) or
+            one number (``method='lower-limit'``/``'upper-limit'``). If
+            ``return_covariance=True``, returns a tuple (means, covariances)
+            where covariances collects the covariance of the sampled limits
+            for each column.
+        """
+        if 'lower' in method:
+            limits = ['lower']
+        elif 'upper' in method:
+            limits = ['upper']
+        else:
+            limits = ['lower', 'upper']
+        cis = [credibility_interval(self[col], weights=self.get_weights(),
+                                    level=level, method=method,
+                                    return_covariance=return_covariance,
+                                    nsamples=nsamples)
+               for col in self.columns]
+        if return_covariance:
+            cis, covs = zip(*cis)
+            mulidx = MultiIndex.from_product([
+                self.columns.get_level_values(level=0),
+                limits
+            ])
+            ncol = len(self.columns)
+            nlim = len(limits)
+            covs = np.asarray(covs).reshape(nlim*ncol, nlim).T
+            covs = DataFrame(covs, index=limits, columns=mulidx)
+        cis = np.atleast_2d(cis) if 'limit' in method else np.asarray(cis).T
+        cis = DataFrame(data=cis, index=limits, columns=self.columns)
+        if return_covariance:
+            return cis, covs
+        else:
+            return cis
+
     @property
     def _constructor_sliced(self):
         return WeightedSeries
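A sketch of the resulting accessor API, using the example chains shipped with the test suite (the `./tests/example_data/pc` path is taken from the tests below):

```python
import numpy as np
from anesthetic import read_chains

np.random.seed(0)
pc = read_chains('./tests/example_data/pc')

# WeightedSeries: two boundaries for a two-sided method
lower, upper = pc.x0.credibility_interval(level=0.68)

# WeightedDataFrame: a 2 x ncols DataFrame indexed by 'lower'/'upper'
ci = pc[['x0', 'x1']].credibility_interval(level=0.68)

# With return_covariance=True the DataFrame method also returns the
# covariances, with a (parameter, limit) MultiIndex on the columns
ci, cov = pc[['x0', 'x1']].credibility_interval(level=0.68,
                                                return_covariance=True)
```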
diff --git a/tests/test_samples.py b/tests/test_samples.py
index f49aed3d..39340e0f 100644
--- a/tests/test_samples.py
+++ b/tests/test_samples.py
@@ -2001,3 +2001,36 @@ def test_axes_limits_2d(kind, kwargs):
     assert 3 < xmax < 3.9
     assert -3.9 < ymin < -3
     assert 3 < ymax < 3.9
+
+
+def test_credibility_interval():
+    np.random.seed(0)
+    pc = read_chains('./tests/example_data/pc')
+
+    ci, cov = pc.x0.credibility_interval(level=0.68,
+                                         method="iso-pdf",
+                                         return_covariance=True)
+    assert ci[0] == pytest.approx(-0.1, rel=0.01, abs=0.01)
+    assert ci[1] == pytest.approx(+0.1, rel=0.01, abs=0.01)
+    assert np.all(np.abs(cov) < 1e-3)
+    assert np.shape(ci) == (2,)
+    assert np.shape(cov) == (2, 2)
+
+    params = ['x0', 'x1']
+    ci, cov = pc[params].credibility_interval(level=0.68,
+                                              method="iso-pdf",
+                                              return_covariance=True)
+    assert_allclose(ci.loc['lower'], -0.1, rtol=0.01, atol=0.01)
+    assert_allclose(ci.loc['upper'], +0.1, rtol=0.01, atol=0.01)
+    assert np.all(np.abs(cov) < 1e-3)
+    assert ci.shape == (2, len(params))
+    assert cov.shape == (2, 2 * len(params))
+    ci, cov = pc[params].credibility_interval(level=0.95+0.025,
+                                              method='lower-limit',
+                                              return_covariance=True)
+    assert_allclose(ci, -0.2, rtol=0.01, atol=0.025)
+    assert np.all(np.abs(cov) < 1e-3)
+    assert cov.shape == (1, len(params))
+    ci = pc[params].credibility_interval(level=0.95+0.025,
+                                         method='upper-limit')
+    assert_allclose(ci, +0.2, rtol=0.01, atol=0.025)
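The iso-pdf intervals asserted above are found by minimising the interval width in CDF space. A standalone sketch of that step, independent of anesthetic, using a plain empirical inverse CDF in place of the Dirichlet-sampled one that the implementation draws via `sample_cdf`:

```python
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import minimize_scalar

np.random.seed(0)
x = np.sort(np.random.normal(size=10000))
# Plain empirical inverse CDF (the implementation above instead draws a
# random CDF realisation each iteration)
invCDF = interp1d(np.linspace(0, 1, len(x)), x)

level = 0.68
# The iso-pdf interval is the narrowest interval containing a fraction
# `level` of the probability mass: minimise invCDF(Y+level) - invCDF(Y)
res = minimize_scalar(lambda Y: invCDF(Y + level) - invCDF(Y),
                      bounds=(0, 1 - level), method="Bounded")
lower, upper = invCDF(res.x), invCDF(res.x + level)  # roughly -1 and +1
```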
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 24196c25..7a948a72 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pytest
 from scipy import special as sp
-from numpy.testing import assert_array_equal
+from numpy.testing import (assert_array_equal, assert_allclose)
 from anesthetic import read_chains
 from pytest import approx
 from anesthetic.utils import (nest_level, compute_nlive, unique, is_int,
@@ -11,7 +11,7 @@
                               logsumexp, sample_compression_1d,
                               triangular_sample_compression_2d,
                               insertion_p_value, compress_weights,
-                              neff)
+                              credibility_interval, sample_cdf, neff)
 
 
 def test_compress_weights():
@@ -22,6 +22,8 @@ def test_compress_weights():
     r = np.random.rand(10)
     w = compress_weights(w=r, u=None, ncompress=False)
     assert_array_equal(w, r)
+    w = compress_weights(w=r, u=None, ncompress='equal')
+    assert np.all(np.logical_or(w == 0, w == 1))
 
     # TODO Remove in 2.1
     with pytest.raises(ValueError):
@@ -192,6 +194,52 @@ def test_p_values_from_sample():
     assert ks_results['p-value'] > 0.05
 
 
+def test_sample_cdf():
+    normal_samples = np.random.normal(loc=2, scale=0.1, size=10000)
+    CDF = sample_cdf(normal_samples, inverse=False)
+    assert_allclose(CDF(-np.inf), 0)
+    assert_allclose(CDF(-100), 0)
+    assert_allclose(CDF(2), 0.5, atol=0.1)
+
+
+@pytest.mark.parametrize('method', ['iso-pdf', 'equal-tailed',
+                                    'lower-limit', 'upper-limit'])
+@pytest.mark.parametrize('return_covariance', [True, False])
+def test_credibility_interval(method, return_covariance):
+    np.random.seed(0)
+    normal_samples = np.random.normal(loc=2, scale=0.1, size=10000)
+    ci = credibility_interval(
+        normal_samples,
+        level=0.84 if 'limit' in method else 0.68,
+        method=method,
+        return_covariance=return_covariance,
+    )
+    if return_covariance:
+        ci, cov = ci
+        assert np.ndim(cov) == (0 if 'limit' in method else 2)
+        assert np.all(np.abs(cov) < 1e-4)
+        assert np.all(np.abs(cov) > 1e-8)
+    if method in ['iso-pdf', 'equal-tailed']:
+        lower, upper = ci
+    else:
+        upper = ci if 'upper' in method else 2.1
+        lower = ci if 'lower' in method else 1.9
+    assert_allclose(lower, 1.9, atol=0.02)
+    assert_allclose(upper, 2.1, atol=0.02)
+
+
+def test_credibility_interval_exceptions():
+    samples = read_chains('./tests/example_data/pc')
+    with pytest.raises(ValueError):
+        credibility_interval(samples.x0, level=1.1)
+    with pytest.raises(ValueError):
+        credibility_interval(samples[['x0', 'x1']])
+    with pytest.raises(ValueError):
+        credibility_interval(samples.x0, weights=[1, 2, 3])
+    with pytest.raises(ValueError):
+        credibility_interval(samples.x0, method='foo')
+
+
 @pytest.mark.parametrize('beta', ['entropy', 'kish', 0.5, 1, 2, np.inf,
                                   '0.5', '1', '1.', '1.0', '1.00', 'inf'])
 def test_effective_samples(beta):
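For background on `test_sample_cdf` and the `nsamples` averaging: `sample_cdf` makes a Bayesian-bootstrap-style draw, in that the CDF increments between sorted samples are Dirichlet(1, ..., 1) distributed, so each call yields one plausible CDF realisation, and repeated draws give error bars on any derived quantity. A standalone illustration of the idea (numbers indicative only):

```python
import numpy as np

np.random.seed(0)
samples = np.sort(np.random.normal(loc=2, scale=0.1, size=1000))

# Spread of the median across 100 sampled CDF realisations
medians = []
for _ in range(100):
    gaps = np.random.dirichlet(np.ones(len(samples) - 1))
    cdf = np.concatenate([[0.0], np.cumsum(gaps)])
    medians.append(np.interp(0.5, cdf, samples))

print(np.mean(medians), np.std(medians))  # mean close to 2, small scatter
```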