Source code for bootstrap_stat.prediction

# pyre-unsafe
"""Prediction error and prediction interval methods."""

import multiprocessing as mp
from collections.abc import Callable
from typing import Any

import numpy as np
import numpy.typing as npt
from pathos.multiprocessing import ProcessPool as Pool

from bootstrap_stat._utils import (
    ArrayLike,
    RNGLike,
    _apply_rng,
    _percentile,
    _spawn_rngs,
)
from bootstrap_stat.distributions import EmpiricalDistribution
from bootstrap_stat.sampling import bootstrap_samples


[docs] def prediction_error_optimism( dist: EmpiricalDistribution, data: ArrayLike, train: Callable[[ArrayLike], Any], predict: Callable[[Any, ArrayLike], Any], error: Callable[[Any, ArrayLike], npt.NDArray[np.float64]], B: int = 200, apparent_error: float | None = None, num_threads: int = 1, rng: RNGLike = None, ) -> float: """Prediction Error, Optimism Method Parameters ---------- dist : EmpiricalDistribution Empirical distribution. data : array_like or pandas DataFrame The data. train : function Function which takes as input a dataset sampled from the empirical distribution and returns a fitted model. predict : function Function which takes as input a fitted model and a dataset, and returns the predicted labels for that dataset error : function Function which takes as input a fitted model and a dataset, and returns the mean prediction error on that dataset. B : int, optional Number of bootstrap samples. Defaults to 200. apparent_error : float, optional The prediction error of the model on the dataset used to train the model, also known as the training error. If omitted, will be calculated. Can be passed to this function to save time, for example if the model had already been fit elsewhere. num_threads : int, optional Number of threads to use for multicore processing. Defaults to 1, meaning all calculations will be done in a single thread. Set to -1 to use all available cores. Returns ------- pe : float Prediction error. See Also -------- prediction_error_632 : .632 bootstrap estimate of prediction error. prediction_interval : Bootstrap prediction interval for a new observation. Notes ----- The bootstrap estimate of prediction error can be used for model selection. It is similar to cross validation. It adds a bias correction term to the apparent error (the accuracy of the predictor applied to the same dataset used to train the predictor). This bias correction term is called the optimism. See [ET93]_ (S17) for details. """ if apparent_error is None: mdl = train(data) pred = predict(mdl, data) apparent_error = np.mean(error(pred, data)) def stat(x): mdl_boot = train(x) pred_orig = predict(mdl_boot, data) # Predictions for original dataset pred_boot = predict(mdl_boot, x) # Predictions for bootstrap dataset err_orig = np.mean(error(pred_orig, data)) # Error for original dataset err_boot = np.mean(error(pred_boot, x)) # Error for bootstrap dataset optimism = err_orig - err_boot return optimism optimism = bootstrap_samples(dist, stat, B, num_threads=num_threads, rng=rng) pe = apparent_error + np.mean(optimism) return pe
[docs] def prediction_error_632( dist: EmpiricalDistribution, data: ArrayLike, train: Callable[[ArrayLike], Any], predict: Callable[[Any, ArrayLike], Any], error: Callable[[Any, ArrayLike], npt.NDArray[np.float64]], B: int = 200, apparent_error: float | None = None, use_632_plus: bool = False, gamma: float | None = None, no_inf_err_rate: Callable[[Any, ArrayLike], float] | None = None, num_threads: int = 1, rng: RNGLike = None, ) -> float: """.632 Bootstrap Parameters ---------- dist : EmpiricalDistribution Empirical distribution. data : array_like or pandas DataFrame The data. train : function Function which takes as input a dataset sampled from the empirical distribution and returns a fitted model. predict : function Function which takes as input a fitted model and a dataset, and returns the predicted labels for that dataset error : function Function which takes as input a fitted model and a dataset, and returns the prediction error *for each observation* in that dataset. B : int, optional Number of bootstrap samples. Defaults to 200. apparent_error : float, optional The prediction error of the model on the dataset used to train the model, also known as the training error. If omitted, will be calculated. Can be passed to this function to save time, for example if the model had already been fit elsewhere. use_632_plus : boolean, optional If True, uses the .632+ bootstrap. See Notes. gamma : float, optional The no-information error rate. Required when `use_632_plus` is True and `no_inf_err_rate` is not provided. no_inf_err_rate : function, optional A function that computes the no-information error rate from predictions and data. Required when `use_632_plus` is True and `gamma` is not provided. Takes as input the predictions and the dataset, and returns a float. num_threads : int, optional Number of threads to use for multicore processing. Defaults to 1, meaning all calculations will be done in a single thread. Set to -1 to use all available cores. Returns ------- pe : float Prediction error. See Also -------- prediction_error_optimism : Optimism-based bootstrap prediction error. prediction_interval : Bootstrap prediction interval for a new observation. Notes ----- The .632 bootstrap estimate of prediction error is the weighted average of the apparent error and another term called eps0. The latter term is kind of like cross validation: we generate a bootstrap sample, train a model on it, and then make predictions using that model on the original dataset. But we only care about the predictions on observations that are *not* part of the bootstrap sample. We then average those prediction errors across all bootstrap samples. See [ET93]_ (S17.7) for details. The method is so-called because the estimated prediction error is .368 times the apparent error plus .632 times eps0. [ET93]_ reports that this method performed better than leave-one-out cross validation in their simulations, having lower variance, but they themselves admit they had not thoroughly evaluated it. [Koh95]_ reported that the .632 bootstrap performed quite poorly when overfitting is present. This led Efron and Tibshirani to propose the .632+ bootstrap in [ET97]_. Finally, [Arl10]_ surveys various approaches to model selection, recommending 10-fold cross validation as the preferred method of model selection. Because the .632 bootstrap has apparently not withstood the test of time, I have made no attempt to implement it efficiently, instead preferring the easy-to-follow approach below. This function should really only serve pedagogical purposes: it is not recommended for serious applications! The no information error rate looks at the prediction at point j, and computes the error *for every label y_i*. It averages the errors over all i and j. This is because, if we assume the features offer no insight into the labels, any observation is as good as any other at predicting any particular label. """ if num_threads == -1: num_threads = mp.cpu_count() if apparent_error is None or (use_632_plus and gamma is None): mdl = train(data) pred = predict(mdl, data) if apparent_error is None: apparent_error = np.mean(error(pred, data)) if use_632_plus and gamma is None: if no_inf_err_rate is None: raise ValueError( "Please specify either the no information error rate," "gamma, or a function to calculate it." ) gamma = no_inf_err_rate(pred, data) if rng is not None: _apply_rng(dist, rng) def _bootstrap_sim(dist, data, train, predict, error, B, worker_rng): _apply_rng(dist, worker_rng) n = len(data) # Total prediction error for each observation, over those # bootstrap samples that do not include that observation. Qi = np.zeros((n,)) # Number of times the ith observation appeared in a bootstrap # sample. Bi = np.zeros((n,)) for j in range(B): x_star, s_star = dist.sample(return_indices=True) mdl_boot = train(x_star) pred_orig = predict(mdl_boot, data) q = error(pred_orig, data) for i in range(n): if i not in s_star: Bi[i] += 1 Qi[i] += q[i] return Bi, Qi if num_threads == 1: Bi, Qi = _bootstrap_sim(dist, data, train, predict, error, B, dist._rng) else: pool = Pool(num_threads) try: pool.restart() except AssertionError: pass results = [] batch_size = B // num_threads extra = B % num_threads batch_sizes = [batch_size] * num_threads for i in range(extra): batch_sizes[i] += 1 worker_rngs = _spawn_rngs(dist._rng, num_threads) for i, worker_rng in enumerate(worker_rngs): r = pool.apipe( _bootstrap_sim, dist, data, train, predict, error, batch_sizes[i], worker_rng, ) results.append(r) n = len(data) Bi = np.zeros((n,)) Qi = np.zeros((n,)) for res in results: Bii, Qii = res.get() Bi += Bii Qi += Qii pool.close() pool.join() eps0 = np.mean(Qi / Bi) err_632 = 0.368 * apparent_error + 0.632 * eps0 if not use_632_plus: return err_632 else: R = (eps0 - apparent_error) / (gamma - apparent_error) if eps0 <= gamma: eps0p = eps0 else: eps0p = gamma if eps0p > apparent_error: Rp = R else: Rp = 0 err_632p = err_632 + ( (eps0p - apparent_error) * (0.368 * 0.632 * Rp) / (1 - 0.368 * Rp) ) return err_632p
[docs] def prediction_interval( dist: EmpiricalDistribution, x: ArrayLike, mean: Callable[[ArrayLike], float] | None = None, std: Callable[[ArrayLike], float] | None = None, B: int = 1000, alpha: float = 0.05, t_star: npt.NDArray[np.float64] | None = None, return_t_star: bool = False, num_threads: int = -1, rng: RNGLike = None, ) -> tuple[float, float] | tuple[float, float, npt.NDArray[np.float64]]: r"""Prediction interval Parameters ---------- dist : EmpiricalDistribution The empirical distribution. x : array_like or pandas DataFrame The data. mean : function, optional A function returning the mean of a bootstrap sample. Defaults to np.mean, but this only works for arrays, not DataFrames. To emphasize, this function must return a float! std : function, optional A function returning the standard deviation of a bootstrap sample. Defaults to np.std using ddof=1. As with `mean`, specify something different for DataFrames! B : int, optional Number of bootstrap samples. Defaults to 1000. alpha : float, optional Number controlling the size of the interval. That is, this function will return a :math:`100(1 - 2\alpha)\%` prediction interval. Defaults to 0.05. t_star : array_like or None Array of studentized values, used to calculate the interval. Can be passed to this function to speed it up, for example when calculating multiple intervals. return_t_star : boolean, optional If True, return the studentized values. (Sometimes it is helpful to plot these.) num_threads : int, optional Number of threads to use for multicore processing. Defaults to -1, meaning all available cores will be used. Set to 1 to disable multicore processing. Returns ------- pred_low, pred_high : float A :math:`100(1 - 2\alpha)\%` prediction interval on a point sampled from F. t_star : array Array of studentized values. Returned only if `return_t_star` is True. See Also -------- prediction_error_optimism : Bootstrap estimate of prediction error, optimism method. prediction_error_632 : .632 bootstrap estimate of prediction error. Notes ----- Suppose we observe :math:`X_1, X_2, \ldots, X_n` sampled IID from a distribution :math:`F`. We wish to calculate a range of plausible values for a new point :math:`X_{n+1}` drawn from the same distribution. This function returns such a *prediction interval*. For each bootstrap replicate, the method draws a sample of size :math:`n` from :math:`\hat{F}` and a single additional point :math:`z^*`, then forms the studentized quantity :math:`t^* = (\bar{X}^* - z^*) / s^*`. The :math:`\alpha` and :math:`1 - \alpha` quantiles of :math:`t^*` are inverted to produce the interval. The construction mirrors the classical t-distribution prediction interval, but makes no parametric assumption on :math:`F`. See [ET93]_ (S6.5) for details. """ if num_threads == -1: num_threads = mp.cpu_count() if mean is None: def mean(x): return np.mean(x) if std is None: def std(x): return np.std(x, ddof=1) x_bar = mean(x) s = std(x) if rng is not None: _apply_rng(dist, rng) def _bootstrap_sim(dist, mean, std, batch_size, worker_rng): _apply_rng(dist, worker_rng) t_star = np.empty((batch_size,)) for i in range(batch_size): x_star = dist.sample() z_star = dist.sample(size=1) s_star = std(x_star) t_star[i] = (mean(x_star) - mean(z_star)) / s_star return t_star if t_star is None: if num_threads == 1: t_star = _bootstrap_sim(dist, mean, std, B, dist._rng) else: pool = Pool(num_threads) try: pool.restart() except AssertionError: pass results = [] batch_size = B // num_threads extra = B % num_threads batch_sizes = [batch_size] * num_threads for i in range(extra): batch_sizes[i] += 1 worker_rngs = _spawn_rngs(dist._rng, num_threads) for i, worker_rng in enumerate(worker_rngs): r = pool.apipe( _bootstrap_sim, dist, mean, std, batch_sizes[i], worker_rng ) results.append(r) t_star = np.hstack([res.get() for res in results]) pool.close() pool.join() t_alpha = _percentile(t_star, [alpha, 1 - alpha]) pred_low = x_bar - t_alpha[1] * s pred_high = x_bar - t_alpha[0] * s if return_t_star: return pred_low, pred_high, t_star else: return pred_low, pred_high