# pyre-unsafe
"""Empirical distribution classes for bootstrap methods."""
from typing import Literal, overload
import numpy as np
import numpy.typing as npt
import pandas as pd
from bootstrap_stat._utils import ArrayLike, RNGLike, Statistic
[docs]
class EmpiricalDistribution:
r"""Empirical Distribution
The Empirical Distribution puts probability 1/n on each of n
observations.
Parameters
----------
data : array_like or pandas DataFrame
The data.
rng : Generator, SeedSequence, int, or None, optional
Source of randomness for sampling. Anything ``np.random.default_rng``
accepts. Pass an integer or pre-built ``Generator`` for reproducible
bootstrap draws; pass ``None`` (default) for non-reproducible
randomness seeded from system entropy.
Attributes
----------
data : array_like or pandas DataFrame
The underlying data.
n : int
Number of observations.
is_multi_sample : bool
Always False for this class.
See Also
--------
MultiSampleEmpiricalDistribution : Extension for two-sample and
multi-sample problems.
Notes
-----
The bootstrap replaces the unknown distribution :math:`F` with its
plug-in estimate :math:`\hat{F}`, the empirical distribution, which
places probability :math:`1/n` on each of the :math:`n`
observations. Drawing a sample of size :math:`n` from
:math:`\hat{F}` with replacement is equivalent to drawing a
bootstrap sample.
All bootstrap methods in this library begin by constructing an
:class:`EmpiricalDistribution` and passing it to an inference
function such as :func:`~bootstrap_stat.bcanon_interval` or
:func:`~bootstrap_stat.standard_error`.
"""
data: ArrayLike
n: int
is_multi_sample: bool
_rng: np.random.Generator
def __init__(self, data: ArrayLike, rng: RNGLike = None) -> None:
self.data = data
self.n = len(data)
self.is_multi_sample = False
self._rng = np.random.default_rng(rng)
@overload
def sample(
self,
size: int | None = None,
return_indices: Literal[False] = False,
reset_index: bool = True,
) -> npt.NDArray[np.float64] | pd.DataFrame: ...
@overload
def sample(
self,
size: int | None,
return_indices: Literal[True],
reset_index: bool = True,
) -> tuple[npt.NDArray[np.float64] | pd.DataFrame, npt.NDArray[np.intp]]: ...
[docs]
def sample(
self,
size: int | None = None,
return_indices: bool = False,
reset_index: bool = True,
) -> (
npt.NDArray[np.float64]
| pd.DataFrame
| tuple[npt.NDArray[np.float64] | pd.DataFrame, npt.NDArray[np.intp]]
):
"""Sample from the empirical distribution
Parameters
----------
size : int or tuple of ints, optional
Output shape. If None (default), samples the same number
of points as the original dataset.
return_indices : boolean, optional
If True, return the indices of the data points
sampled. Defaults to False.
reset_index : boolean, optional
If True (default), reset the index. Applies only to data
frames. This is usually what we would want to do, except
for debugging perhaps.
Returns
-------
samples : ndarray or pandas DataFrame
IID samples from the empirical distribution.
ind : ndarray
Indices of samples chosen. Only returned if
`return_indices` is True.
"""
if size is None:
s = self.n
else:
s = size
if return_indices:
ind = self._rng.choice(range(self.n), size=s, replace=True)
try:
samples = self.data[ind]
except KeyError:
samples = self.data.iloc[ind]
if reset_index:
samples.reset_index(drop=True, inplace=True)
except TypeError:
d = np.array(self.data)
samples = d[ind]
return samples, ind
else:
# Generator.choice (unlike the legacy np.random.choice) accepts
# multi-dimensional arrays, so we explicitly route DataFrames /
# Series through pandas to preserve the original return type.
if isinstance(self.data, (pd.DataFrame, pd.Series)):
samples = self.data.sample(s, replace=True, random_state=self._rng)
if reset_index:
samples.reset_index(drop=True, inplace=True)
return samples
return self._rng.choice(self.data, size=s, replace=True)
[docs]
def calculate_parameter(self, t: Statistic) -> float:
"""Calculate a parameter of the distribution.
Parameters
----------
t : function
Function to be applied to dataset. If using an n-Sample
Distribution, t should take as input a tuple of data sets
of the appropriate size.
Returns
-------
tF : float
Parameter of distribution.
"""
return t(self.data)
[docs]
class MultiSampleEmpiricalDistribution(EmpiricalDistribution):
r"""Multi-Sample Empirical Distribution
Parameters
----------
datasets : tuple of arrays or pandas DataFrames.
Observed data sets.
rng : Generator, SeedSequence, int, or None, optional
Source of randomness. A statistically-independent child stream is
spawned for each underlying :class:`EmpiricalDistribution` so that
the per-sample draws are reproducible from a single ``rng``.
See Also
--------
EmpiricalDistribution : Single-sample empirical distribution.
Notes
-----
Suppose we observe
.. math::
x_i \sim F, i=1,...,m
y_j \sim G, j=1,...,n
Then :math:`P = (\hat{F}, \hat{G})` is the probabilistic mechanism
consisting of the two empirical distributions :math:`F` and
:math:`G`. Sampling from :math:`P` amounts to sampling :math:`m`
points IID from :math:`\hat{F}`, and :math:`n` points IID from
:math:`\hat{G}`. This is called the Two Sample Empirical
Distribution, and comes up a lot in A/B testing. The
generalization to more than two samples is obvious.
Because of some of the implementation details of the bootstrap
methods, we use tuples for everything. So when initializing, be
sure to wrap the different datasets in a tuple. Samples will
themselves be tuples, etc. See the function documentation below
for details and examples.
This is a relatively thin wrapper around the regular
:class:`EmpiricalDistribution`: we just create a distinct
:class:`EmpiricalDistribution` for each dataset, and use that for
sampling.
Examples
--------
>>> data = [1, 2, 3]
>>> dist = EmpiricalDistribution(data)
>>> sample = dist.sample()
>>> len(sample)
3
>>> data_a = [1, 2, 3]
>>> data_b = [4, 5, 6]
>>> data = (data_a, data_b) # Note tuple
>>> dist = MultiSampleEmpiricalDistribution(data)
>>> a, b = dist.sample() # Can de-tuple directly
>>> len(a), len(b)
(3, 3)
>>> ab = dist.sample() # Or indirectly, which is often more useful
>>> len(ab)
2
"""
dists: list[EmpiricalDistribution]
n: list[int] # type: ignore[assignment]
def __init__(self, datasets: tuple[ArrayLike, ...], rng: RNGLike = None) -> None:
self.data = datasets
parent_rng = np.random.default_rng(rng)
child_rngs = parent_rng.spawn(len(datasets))
self.dists = [
EmpiricalDistribution(d, rng=r) for d, r in zip(datasets, child_rngs)
]
self.n = [len(d) for d in datasets]
self.is_multi_sample = True
self._rng = parent_rng
[docs]
def sample( # type: ignore[override]
self, size: tuple[int, ...] | list[int] | None = None
) -> tuple[npt.NDArray[np.float64] | pd.DataFrame, ...]:
"""Sample from the empirical distribution
Parameters
----------
size : tuple of ints, optional
Number of samples to be drawn from each
EmpiricalDistribution. If None (default), samples the same
numbers of points as the original datasets.
Returns
-------
samples : tuple of ndarray or pandas DataFrame
IID samples from the empirical distributions.
"""
if size is None:
s = self.n
else:
s = size
samples = [d.sample(size=si) for d, si in zip(self.dists, s)]
return (*samples,)
[docs]
def calculate_parameter(self, t: Statistic) -> float:
r"""Calculate a parameter of the distribution.
Parameters
----------
t : function
Function to be applied to dataset. Should take as input a
tuple of data sets of the appropriate size.
Returns
-------
tF : float
Parameter of distribution.
Examples
--------
Suppose we are in the Two-Sample case and have two empirical
distributions, :math:`\hat{F}` and :math:`\hat{G}`, and we
want to calculate the difference in means of these
distributions. We might do something like:
>>> data_a = [1, 2, 3]
>>> data_b = [4, 5, 6]
>>> data = (data_a, data_b) # Note tuple
>>> dist = MultiSampleEmpiricalDistribution(data)
>>> def parameter(ab):
... a, b = ab # Note de-tupling
... return np.mean(b) - np.mean(a)
>>> float(dist.calculate_parameter(parameter))
3.0
"""
return t(self.data)