Source code for bootstrap_stat.bias

# pyre-unsafe
"""Bias estimation and correction methods."""

import multiprocessing as mp
from collections.abc import Callable
from typing import Any, Literal

import numpy as np
import numpy.typing as npt
from pathos.multiprocessing import ProcessPool as Pool

from bootstrap_stat._utils import (
    ArrayLike,
    RNGLike,
    Statistic,
    _resampling_vector,
    _spawn_rngs,
)
from bootstrap_stat.distributions import EmpiricalDistribution
from bootstrap_stat.sampling import bootstrap_samples, jackknife_values



[docs]
def bias(
    dist: EmpiricalDistribution,
    stat: Statistic,
    t: Statistic,
    B: int = 200,
    return_samples: bool = False,
    theta_star: npt.NDArray[np.float64] | None = None,
    num_threads: int = 1,
    rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
    r"""Estimate of bias

    Parameters
    ----------
     dist : EmpiricalDistribution
        The empirical distribution.
     stat : function
        The statistic for which we wish to calculate the bias.
     t : function
        Function to be applied to empirical distribution function.
     B : int, optional
        Number of bootstrap samples. Defaults to 200.
     return_samples : boolean, optional
        If True, return the bootstrapped statistic values. Defaults to False.
     theta_star : array_like, optional
        Bootstrapped statistic values. Can be passed if they have
        already been calculated, which will speed this up
        considerably.
     num_threads : int, optional
        Number of threads to use for multicore processing. Defaults to
        1, meaning all calculations will be done in a single
        thread. Set to -1 to use all available cores.

    Returns
    -------
     bias : float
        Estimate of bias.
     theta_star : ndarray
        Array of bootstrapped statistic values. Only returned if
        `return_samples` is True.

    See Also
    --------
    jackknife_bias : Jackknife estimate of bias.
    better_bootstrap_bias : Bootstrap bias estimate with faster
        convergence for plug-in statistics.
    bias_corrected : Bias-corrected estimator.

    Notes
    -----
    The bootstrap bias estimate is

    .. math::

        \widehat{\mathrm{bias}} = \bar{\theta}^* - t(\hat{F}),

    where :math:`\bar{\theta}^* = B^{-1}\sum_{b=1}^{B}\mathrm{stat}(x^*_b)`
    is the mean over bootstrap samples and :math:`t(\hat{F})` is the
    plug-in value of the parameter under the empirical distribution.
    For a smooth plug-in statistic :math:`t(\hat{F}) = \mathrm{stat}(x)`,
    but the two arguments allow the caller to supply a different plug-in
    value when needed. See [ET93]_ (S10.2) for details.

    For plug-in statistics, :func:`better_bootstrap_bias` converges
    faster using the same number of bootstrap samples. For non-smooth
    statistics, :func:`jackknife_bias` is an alternative.

    """
    if theta_star is None:
        theta_star = bootstrap_samples(dist, stat, B, num_threads=num_threads, rng=rng)

    tF_hat = dist.calculate_parameter(t)
    bias_est = np.mean(theta_star) - tF_hat

    if return_samples:
        return bias_est, theta_star
    else:
        return bias_est




[docs]
def better_bootstrap_bias(
    x: ArrayLike,
    stat: Callable[[Any, npt.NDArray[np.float64]], float],
    B: int = 400,
    return_samples: bool = False,
    num_threads: int = 1,
    rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
    r"""Better bootstrap bias.

    Parameters
    ----------
     x : array_like or pandas DataFrame
        The data.
     stat : function
        The statistic. Should take `x` and a resampling vector as
        input. See Notes.
     B : int, optional
        Number of bootstrap samples. Defaults to 400.
     return_samples : boolean, optional
        If True, return the bootstrapped statistic values. Defaults to False.
     num_threads : int, optional
        Number of threads to use for multicore processing. Defaults to
        1, meaning all calculations will be done in a single
        thread. Set to -1 to use all available cores.

    Returns
    -------
     bias : float
        Estimate of bias.
     theta_star : ndarray
        Array of bootstrapped statistic values. Only returned if
        `return_samples` is True.

    See Also
    --------
    bias : General bootstrap estimate of bias.
    jackknife_bias : Jackknife estimate of bias.
    bias_corrected : Bias-corrected estimator.

    Notes
    -----
    The "better" bootstrap bias estimate is only applicable when
    `stat` is a plug-in statistic for the parameter being estimated,
    that is, having the form :math:`t(\hat{F})`, where :math:`\hat{F}`
    is the empirical distribution. Notable situations where this
    assumption does not hold include robust statistics like using the
    alpha-trimmed mean, since that is not the plug-in statistic for
    the mean. In cases like that, just use the "worse" :func:`bias`
    function. The advantage of the "better" bootstrap bias estimate is
    faster convergence.  Whereas using `B` = 400 is typically adequate
    here, it can take thousands of bootstrap samples to give an
    accurate estimate in the "worse" :func:`bias` function.

    """
    if num_threads == -1:
        num_threads = mp.cpu_count()

    parent_rng = np.random.default_rng(rng)

    def _bootstrap_sim(x, stat, batch_size, worker_rng):
        n = len(x)
        sum_p = np.zeros((n,))
        theta_star = np.zeros((batch_size,))
        for i in range(batch_size):
            p = _resampling_vector(n, rng=worker_rng)
            sum_p += p
            theta_star[i] = stat(x, p)

        return theta_star, sum_p

    if num_threads == 1:
        theta_star, sum_p = _bootstrap_sim(x, stat, B, parent_rng)
    else:
        pool = Pool(num_threads)
        try:
            pool.restart()
        except AssertionError:
            pass

        results = []
        batch_size = B // num_threads
        extra = B % num_threads
        batch_sizes = [batch_size] * num_threads
        for i in range(extra):
            batch_sizes[i] += 1

        worker_rngs = _spawn_rngs(parent_rng, num_threads)
        for i, worker_rng in enumerate(worker_rngs):
            r = pool.apipe(_bootstrap_sim, x, stat, batch_sizes[i], worker_rng)
            results.append(r)

        theta_star = []
        sum_p = np.zeros((len(x),))
        for res in results:
            theta_star_i, sum_p_i = res.get()
            theta_star.append(theta_star_i)
            sum_p += sum_p_i

        theta_star = np.hstack([t for t in theta_star])
        pool.close()
        pool.join()

    bias_est = np.mean(theta_star) - stat(x, sum_p / len(theta_star))

    if return_samples:
        return bias_est, theta_star
    else:
        return bias_est




[docs]
def jackknife_bias(
    x: ArrayLike,
    stat: Statistic,
    return_samples: bool = False,
    jv: npt.NDArray[np.float64] | None = None,
    num_threads: int = 1,
) -> float | tuple[float, npt.NDArray[np.float64]]:
    r"""Jackknife estimate of bias.

    Parameters
    ----------
     x : array_like or pandas DataFrame
        The data.
     stat : function
        The statistic.
     return_samples : boolean, optional
        If True, return the jackknife values. Defaults to False.
     jv : array_like, optional
        Jackknife values. Can be passed if they have already been
        calculated, which will speed this up considerably.
     num_threads : int, optional
        Number of threads to use for multicore processing. Defaults to
        1, meaning all calculations will be done in a single
        thread. Set to -1 to use all available cores.

    Returns
    -------
     bias : float
        Estimate of bias.
     jv : ndarray
        Jackknife values. Only returned if `return_samples` is True.

    See Also
    --------
    bias : Bootstrap estimate of bias.
    better_bootstrap_bias : Bootstrap bias estimate with faster
        convergence for plug-in statistics.
    bias_corrected : Bias-corrected estimator.

    Notes
    -----
    The jackknife estimate of bias is only applicable when `stat` is a
    plug-in statistic, that is, having the form :math:`t(\hat{F})`,
    where :math:`\hat{F}` is the empirical distribution. Moreover, it
    is only applicable when `t` is a smooth function. Notable
    exceptions include the median. The jackknife cannot be used to
    estimate the bias of non-smooth estimators. See [ET93]_ (S10.5) for
    details.

    """
    if jv is None:
        jv = jackknife_values(x, stat, num_threads=num_threads)

    n = len(jv)
    bias_est = (n - 1) * (np.mean(jv) - stat(x))
    if return_samples:
        return bias_est, jv
    else:
        return bias_est




[docs]
def bias_corrected(
    x: ArrayLike,
    stat: Statistic | Callable[[Any, npt.NDArray[np.float64]], float],
    method: Literal[
        "better_bootstrap_bias", "bias", "jackknife"
    ] = "better_bootstrap_bias",
    dist: EmpiricalDistribution | None = None,
    t: Statistic | None = None,
    B: int | None = None,
    return_samples: bool = False,
    theta_star: npt.NDArray[np.float64] | None = None,
    jv: npt.NDArray[np.float64] | None = None,
    num_threads: int = 1,
    rng: RNGLike = None,
) -> float | tuple[float, npt.NDArray[np.float64]]:
    """Bias-corrected estimator.

    Parameters
    ----------
     x : array_like or pandas DataFrame
        The data.
     stat : function
        The statistic. For use with "bias" and "jackknife" methods,
        should take a dataset as the input. For use with the
        "better_bootstrap_bias" method, it should take the dataset and
        a resampling vector as input. See the documentation in the
        better_bootstrap_bias function for details.
     method : ["better_bootstrap_bias", "bias", "jackknife"]
        The method by which we correct for bias. Defaults to
        "better_bootstrap_bias".
     dist : EmpiricalDistribution
        The empirical distribution. Required when method == "bias".
     t : function
        Function to be applied to empirical distribution
        function. Required when method == "bias".
     B : int, optional
        Number of bootstrap samples. Required when method == "bias" or
        "better_bootstrap_bias". Defaults to 400 when method ==
        "better_bootstrap_bias" or 4000 when method == "bias".
     return_samples : boolean, optional
        If True, return the bootstrapped statistic or jackknife
        values. Defaults to False.
     theta_star : array_like, optional
        Bootstrapped statistic values. Can be passed if they have
        already been calculated, which will speed this up
        considerably. Only used when method == "bias".
     jv : array_like, optional
        Jackknife values. Can be passed if they have already been
        calculated, which will speed this up considerably. Only used
        when method == "jackknife".
     num_threads : int, optional
        Number of threads to use for multicore processing. Defaults to
        1, meaning all calculations will be done in a single
        thread. Set to -1 to use all available cores.


    Returns
    -------
     theta_bar : float
        The bias-corrected estimator.
     theta_star : ndarray
        Array of bootstrapped statistic values. Only returned if
        `return_samples` is True and method is either "bias" or
        "better_bootstrap_bias".
     jv : ndarray
        Jackknife values. Only returned if `return_samples` is True
        and method == "jackknife".

    See Also
    --------
    bias : Bootstrap estimate of bias.
    jackknife_bias : Jackknife estimate of bias.
    better_bootstrap_bias : Bootstrap bias estimate with faster
        convergence for plug-in statistics.

    Notes
    -----
    Per [ET93]_ (S10.6), bias-corrected estimators tend to have much
    higher variance than the non-corrected version. This should be
    assessed, for example, using the bootstrap to directly estimate
    the standard error of the corrected and uncorrected estimators.

    """
    if method == "better_bootstrap_bias":
        if B is None:
            B = 400
        n = len(x)
        p0 = np.ones((n,)) / n
        b, theta_star = better_bootstrap_bias(
            x, stat, B=B, return_samples=True, num_threads=num_threads, rng=rng
        )
        corrected = stat(x, p0) - b
        if return_samples:
            return corrected, theta_star
        else:
            return corrected
    elif method == "bias":
        if B is None:
            B = 4000

        b, theta_star = bias(
            dist,
            stat,
            t,
            B=B,
            theta_star=theta_star,
            return_samples=True,
            num_threads=num_threads,
            rng=rng,
        )
        corrected = stat(x) - b
        if return_samples:
            return corrected, theta_star
        else:
            return corrected
    elif method == "jackknife":
        b, jv = jackknife_bias(
            x, stat, jv=jv, return_samples=True, num_threads=num_threads
        )
        corrected = stat(x) - b
        if return_samples:
            return corrected, jv
        else:
            return corrected