Source code for bootstrap_stat.distributions

# pyre-unsafe
"""Empirical distribution classes for bootstrap methods."""

from typing import Literal, overload

import numpy as np
import numpy.typing as npt
import pandas as pd

from bootstrap_stat._utils import ArrayLike, RNGLike, Statistic


[docs] class EmpiricalDistribution: r"""Empirical Distribution The Empirical Distribution puts probability 1/n on each of n observations. Parameters ---------- data : array_like or pandas DataFrame The data. rng : Generator, SeedSequence, int, or None, optional Source of randomness for sampling. Anything ``np.random.default_rng`` accepts. Pass an integer or pre-built ``Generator`` for reproducible bootstrap draws; pass ``None`` (default) for non-reproducible randomness seeded from system entropy. Attributes ---------- data : array_like or pandas DataFrame The underlying data. n : int Number of observations. is_multi_sample : bool Always False for this class. See Also -------- MultiSampleEmpiricalDistribution : Extension for two-sample and multi-sample problems. Notes ----- The bootstrap replaces the unknown distribution :math:`F` with its plug-in estimate :math:`\hat{F}`, the empirical distribution, which places probability :math:`1/n` on each of the :math:`n` observations. Drawing a sample of size :math:`n` from :math:`\hat{F}` with replacement is equivalent to drawing a bootstrap sample. All bootstrap methods in this library begin by constructing an :class:`EmpiricalDistribution` and passing it to an inference function such as :func:`~bootstrap_stat.bcanon_interval` or :func:`~bootstrap_stat.standard_error`. """ data: ArrayLike n: int is_multi_sample: bool _rng: np.random.Generator def __init__(self, data: ArrayLike, rng: RNGLike = None) -> None: self.data = data self.n = len(data) self.is_multi_sample = False self._rng = np.random.default_rng(rng) @overload def sample( self, size: int | None = None, return_indices: Literal[False] = False, reset_index: bool = True, ) -> npt.NDArray[np.float64] | pd.DataFrame: ... @overload def sample( self, size: int | None, return_indices: Literal[True], reset_index: bool = True, ) -> tuple[npt.NDArray[np.float64] | pd.DataFrame, npt.NDArray[np.intp]]: ...
[docs] def sample( self, size: int | None = None, return_indices: bool = False, reset_index: bool = True, ) -> ( npt.NDArray[np.float64] | pd.DataFrame | tuple[npt.NDArray[np.float64] | pd.DataFrame, npt.NDArray[np.intp]] ): """Sample from the empirical distribution Parameters ---------- size : int or tuple of ints, optional Output shape. If None (default), samples the same number of points as the original dataset. return_indices : boolean, optional If True, return the indices of the data points sampled. Defaults to False. reset_index : boolean, optional If True (default), reset the index. Applies only to data frames. This is usually what we would want to do, except for debugging perhaps. Returns ------- samples : ndarray or pandas DataFrame IID samples from the empirical distribution. ind : ndarray Indices of samples chosen. Only returned if `return_indices` is True. """ if size is None: s = self.n else: s = size if return_indices: ind = self._rng.choice(range(self.n), size=s, replace=True) try: samples = self.data[ind] except KeyError: samples = self.data.iloc[ind] if reset_index: samples.reset_index(drop=True, inplace=True) except TypeError: d = np.array(self.data) samples = d[ind] return samples, ind else: # Generator.choice (unlike the legacy np.random.choice) accepts # multi-dimensional arrays, so we explicitly route DataFrames / # Series through pandas to preserve the original return type. if isinstance(self.data, (pd.DataFrame, pd.Series)): samples = self.data.sample(s, replace=True, random_state=self._rng) if reset_index: samples.reset_index(drop=True, inplace=True) return samples return self._rng.choice(self.data, size=s, replace=True)
[docs] def calculate_parameter(self, t: Statistic) -> float: """Calculate a parameter of the distribution. Parameters ---------- t : function Function to be applied to dataset. If using an n-Sample Distribution, t should take as input a tuple of data sets of the appropriate size. Returns ------- tF : float Parameter of distribution. """ return t(self.data)
[docs] class MultiSampleEmpiricalDistribution(EmpiricalDistribution): r"""Multi-Sample Empirical Distribution Parameters ---------- datasets : tuple of arrays or pandas DataFrames. Observed data sets. rng : Generator, SeedSequence, int, or None, optional Source of randomness. A statistically-independent child stream is spawned for each underlying :class:`EmpiricalDistribution` so that the per-sample draws are reproducible from a single ``rng``. See Also -------- EmpiricalDistribution : Single-sample empirical distribution. Notes ----- Suppose we observe .. math:: x_i \sim F, i=1,...,m y_j \sim G, j=1,...,n Then :math:`P = (\hat{F}, \hat{G})` is the probabilistic mechanism consisting of the two empirical distributions :math:`F` and :math:`G`. Sampling from :math:`P` amounts to sampling :math:`m` points IID from :math:`\hat{F}`, and :math:`n` points IID from :math:`\hat{G}`. This is called the Two Sample Empirical Distribution, and comes up a lot in A/B testing. The generalization to more than two samples is obvious. Because of some of the implementation details of the bootstrap methods, we use tuples for everything. So when initializing, be sure to wrap the different datasets in a tuple. Samples will themselves be tuples, etc. See the function documentation below for details and examples. This is a relatively thin wrapper around the regular :class:`EmpiricalDistribution`: we just create a distinct :class:`EmpiricalDistribution` for each dataset, and use that for sampling. Examples -------- >>> data = [1, 2, 3] >>> dist = EmpiricalDistribution(data) >>> sample = dist.sample() >>> len(sample) 3 >>> data_a = [1, 2, 3] >>> data_b = [4, 5, 6] >>> data = (data_a, data_b) # Note tuple >>> dist = MultiSampleEmpiricalDistribution(data) >>> a, b = dist.sample() # Can de-tuple directly >>> len(a), len(b) (3, 3) >>> ab = dist.sample() # Or indirectly, which is often more useful >>> len(ab) 2 """ dists: list[EmpiricalDistribution] n: list[int] # type: ignore[assignment] def __init__(self, datasets: tuple[ArrayLike, ...], rng: RNGLike = None) -> None: self.data = datasets parent_rng = np.random.default_rng(rng) child_rngs = parent_rng.spawn(len(datasets)) self.dists = [ EmpiricalDistribution(d, rng=r) for d, r in zip(datasets, child_rngs) ] self.n = [len(d) for d in datasets] self.is_multi_sample = True self._rng = parent_rng
[docs] def sample( # type: ignore[override] self, size: tuple[int, ...] | list[int] | None = None ) -> tuple[npt.NDArray[np.float64] | pd.DataFrame, ...]: """Sample from the empirical distribution Parameters ---------- size : tuple of ints, optional Number of samples to be drawn from each EmpiricalDistribution. If None (default), samples the same numbers of points as the original datasets. Returns ------- samples : tuple of ndarray or pandas DataFrame IID samples from the empirical distributions. """ if size is None: s = self.n else: s = size samples = [d.sample(size=si) for d, si in zip(self.dists, s)] return (*samples,)
[docs] def calculate_parameter(self, t: Statistic) -> float: r"""Calculate a parameter of the distribution. Parameters ---------- t : function Function to be applied to dataset. Should take as input a tuple of data sets of the appropriate size. Returns ------- tF : float Parameter of distribution. Examples -------- Suppose we are in the Two-Sample case and have two empirical distributions, :math:`\hat{F}` and :math:`\hat{G}`, and we want to calculate the difference in means of these distributions. We might do something like: >>> data_a = [1, 2, 3] >>> data_b = [4, 5, 6] >>> data = (data_a, data_b) # Note tuple >>> dist = MultiSampleEmpiricalDistribution(data) >>> def parameter(ab): ... a, b = ab # Note de-tupling ... return np.mean(b) - np.mean(a) >>> float(dist.calculate_parameter(parameter)) 3.0 """ return t(self.data)