# pyre-unsafe
"""Bias estimation and correction methods."""
import multiprocessing as mp
from collections.abc import Callable
from typing import Any, Literal
import numpy as np
import numpy.typing as npt
from pathos.multiprocessing import ProcessPool as Pool
from bootstrap_stat._utils import (
ArrayLike,
RNGLike,
Statistic,
_resampling_vector,
_spawn_rngs,
)
from bootstrap_stat.distributions import EmpiricalDistribution
from bootstrap_stat.sampling import bootstrap_samples, jackknife_values
[docs]
def bias(
dist: EmpiricalDistribution,
stat: Statistic,
t: Statistic,
B: int = 200,
return_samples: bool = False,
theta_star: npt.NDArray[np.float64] | None = None,
num_threads: int = 1,
rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
r"""Estimate of bias
Parameters
----------
dist : EmpiricalDistribution
The empirical distribution.
stat : function
The statistic for which we wish to calculate the bias.
t : function
Function to be applied to empirical distribution function.
B : int, optional
Number of bootstrap samples. Defaults to 200.
return_samples : boolean, optional
If True, return the bootstrapped statistic values. Defaults to False.
theta_star : array_like, optional
Bootstrapped statistic values. Can be passed if they have
already been calculated, which will speed this up
considerably.
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
bias : float
Estimate of bias.
theta_star : ndarray
Array of bootstrapped statistic values. Only returned if
`return_samples` is True.
See Also
--------
jackknife_bias : Jackknife estimate of bias.
better_bootstrap_bias : Bootstrap bias estimate with faster
convergence for plug-in statistics.
bias_corrected : Bias-corrected estimator.
Notes
-----
The bootstrap bias estimate is
.. math::
\widehat{\mathrm{bias}} = \bar{\theta}^* - t(\hat{F}),
where :math:`\bar{\theta}^* = B^{-1}\sum_{b=1}^{B}\mathrm{stat}(x^*_b)`
is the mean over bootstrap samples and :math:`t(\hat{F})` is the
plug-in value of the parameter under the empirical distribution.
For a smooth plug-in statistic :math:`t(\hat{F}) = \mathrm{stat}(x)`,
but the two arguments allow the caller to supply a different plug-in
value when needed. See [ET93]_ (S10.2) for details.
For plug-in statistics, :func:`better_bootstrap_bias` converges
faster using the same number of bootstrap samples. For non-smooth
statistics, :func:`jackknife_bias` is an alternative.
"""
if theta_star is None:
theta_star = bootstrap_samples(dist, stat, B, num_threads=num_threads, rng=rng)
tF_hat = dist.calculate_parameter(t)
bias_est = np.mean(theta_star) - tF_hat
if return_samples:
return bias_est, theta_star
else:
return bias_est
[docs]
def better_bootstrap_bias(
x: ArrayLike,
stat: Callable[[Any, npt.NDArray[np.float64]], float],
B: int = 400,
return_samples: bool = False,
num_threads: int = 1,
rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
r"""Better bootstrap bias.
Parameters
----------
x : array_like or pandas DataFrame
The data.
stat : function
The statistic. Should take `x` and a resampling vector as
input. See Notes.
B : int, optional
Number of bootstrap samples. Defaults to 400.
return_samples : boolean, optional
If True, return the bootstrapped statistic values. Defaults to False.
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
bias : float
Estimate of bias.
theta_star : ndarray
Array of bootstrapped statistic values. Only returned if
`return_samples` is True.
See Also
--------
bias : General bootstrap estimate of bias.
jackknife_bias : Jackknife estimate of bias.
bias_corrected : Bias-corrected estimator.
Notes
-----
The "better" bootstrap bias estimate is only applicable when
`stat` is a plug-in statistic for the parameter being estimated,
that is, having the form :math:`t(\hat{F})`, where :math:`\hat{F}`
is the empirical distribution. Notable situations where this
assumption does not hold include robust statistics like using the
alpha-trimmed mean, since that is not the plug-in statistic for
the mean. In cases like that, just use the "worse" :func:`bias`
function. The advantage of the "better" bootstrap bias estimate is
faster convergence. Whereas using `B` = 400 is typically adequate
here, it can take thousands of bootstrap samples to give an
accurate estimate in the "worse" :func:`bias` function.
"""
if num_threads == -1:
num_threads = mp.cpu_count()
parent_rng = np.random.default_rng(rng)
def _bootstrap_sim(x, stat, batch_size, worker_rng):
n = len(x)
sum_p = np.zeros((n,))
theta_star = np.zeros((batch_size,))
for i in range(batch_size):
p = _resampling_vector(n, rng=worker_rng)
sum_p += p
theta_star[i] = stat(x, p)
return theta_star, sum_p
if num_threads == 1:
theta_star, sum_p = _bootstrap_sim(x, stat, B, parent_rng)
else:
pool = Pool(num_threads)
try:
pool.restart()
except AssertionError:
pass
results = []
batch_size = B // num_threads
extra = B % num_threads
batch_sizes = [batch_size] * num_threads
for i in range(extra):
batch_sizes[i] += 1
worker_rngs = _spawn_rngs(parent_rng, num_threads)
for i, worker_rng in enumerate(worker_rngs):
r = pool.apipe(_bootstrap_sim, x, stat, batch_sizes[i], worker_rng)
results.append(r)
theta_star = []
sum_p = np.zeros((len(x),))
for res in results:
theta_star_i, sum_p_i = res.get()
theta_star.append(theta_star_i)
sum_p += sum_p_i
theta_star = np.hstack([t for t in theta_star])
pool.close()
pool.join()
bias_est = np.mean(theta_star) - stat(x, sum_p / len(theta_star))
if return_samples:
return bias_est, theta_star
else:
return bias_est
[docs]
def jackknife_bias(
x: ArrayLike,
stat: Statistic,
return_samples: bool = False,
jv: npt.NDArray[np.float64] | None = None,
num_threads: int = 1,
) -> float | tuple[float, npt.NDArray[np.float64]]:
r"""Jackknife estimate of bias.
Parameters
----------
x : array_like or pandas DataFrame
The data.
stat : function
The statistic.
return_samples : boolean, optional
If True, return the jackknife values. Defaults to False.
jv : array_like, optional
Jackknife values. Can be passed if they have already been
calculated, which will speed this up considerably.
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
bias : float
Estimate of bias.
jv : ndarray
Jackknife values. Only returned if `return_samples` is True.
See Also
--------
bias : Bootstrap estimate of bias.
better_bootstrap_bias : Bootstrap bias estimate with faster
convergence for plug-in statistics.
bias_corrected : Bias-corrected estimator.
Notes
-----
The jackknife estimate of bias is only applicable when `stat` is a
plug-in statistic, that is, having the form :math:`t(\hat{F})`,
where :math:`\hat{F}` is the empirical distribution. Moreover, it
is only applicable when `t` is a smooth function. Notable
exceptions include the median. The jackknife cannot be used to
estimate the bias of non-smooth estimators. See [ET93]_ (S10.5) for
details.
"""
if jv is None:
jv = jackknife_values(x, stat, num_threads=num_threads)
n = len(jv)
bias_est = (n - 1) * (np.mean(jv) - stat(x))
if return_samples:
return bias_est, jv
else:
return bias_est
[docs]
def bias_corrected(
x: ArrayLike,
stat: Statistic | Callable[[Any, npt.NDArray[np.float64]], float],
method: Literal[
"better_bootstrap_bias", "bias", "jackknife"
] = "better_bootstrap_bias",
dist: EmpiricalDistribution | None = None,
t: Statistic | None = None,
B: int | None = None,
return_samples: bool = False,
theta_star: npt.NDArray[np.float64] | None = None,
jv: npt.NDArray[np.float64] | None = None,
num_threads: int = 1,
rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
"""Bias-corrected estimator.
Parameters
----------
x : array_like or pandas DataFrame
The data.
stat : function
The statistic. For use with "bias" and "jackknife" methods,
should take a dataset as the input. For use with the
"better_bootstrap_bias" method, it should take the dataset and
a resampling vector as input. See the documentation in the
better_bootstrap_bias function for details.
method : ["better_bootstrap_bias", "bias", "jackknife"]
The method by which we correct for bias. Defaults to
"better_bootstrap_bias".
dist : EmpiricalDistribution
The empirical distribution. Required when method == "bias".
t : function
Function to be applied to empirical distribution
function. Required when method == "bias".
B : int, optional
Number of bootstrap samples. Required when method == "bias" or
"better_bootstrap_bias". Defaults to 400 when method ==
"better_bootstrap_bias" or 4000 when method == "bias".
return_samples : boolean, optional
If True, return the bootstrapped statistic or jackknife
values. Defaults to False.
theta_star : array_like, optional
Bootstrapped statistic values. Can be passed if they have
already been calculated, which will speed this up
considerably. Only used when method == "bias".
jv : array_like, optional
Jackknife values. Can be passed if they have already been
calculated, which will speed this up considerably. Only used
when method == "jackknife".
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
theta_bar : float
The bias-corrected estimator.
theta_star : ndarray
Array of bootstrapped statistic values. Only returned if
`return_samples` is True and method is either "bias" or
"better_bootstrap_bias".
jv : ndarray
Jackknife values. Only returned if `return_samples` is True
and method == "jackknife".
See Also
--------
bias : Bootstrap estimate of bias.
jackknife_bias : Jackknife estimate of bias.
better_bootstrap_bias : Bootstrap bias estimate with faster
convergence for plug-in statistics.
Notes
-----
Per [ET93]_ (S10.6), bias-corrected estimators tend to have much
higher variance than the non-corrected version. This should be
assessed, for example, using the bootstrap to directly estimate
the standard error of the corrected and uncorrected estimators.
"""
if method == "better_bootstrap_bias":
if B is None:
B = 400
n = len(x)
p0 = np.ones((n,)) / n
b, theta_star = better_bootstrap_bias(
x, stat, B=B, return_samples=True, num_threads=num_threads, rng=rng
)
corrected = stat(x, p0) - b
if return_samples:
return corrected, theta_star
else:
return corrected
elif method == "bias":
if B is None:
B = 4000
b, theta_star = bias(
dist,
stat,
t,
B=B,
theta_star=theta_star,
return_samples=True,
num_threads=num_threads,
rng=rng,
)
corrected = stat(x) - b
if return_samples:
return corrected, theta_star
else:
return corrected
elif method == "jackknife":
b, jv = jackknife_bias(
x, stat, jv=jv, return_samples=True, num_threads=num_threads
)
corrected = stat(x) - b
if return_samples:
return corrected, jv
else:
return corrected