# pyre-unsafe
"""Prediction error and prediction interval methods."""
import multiprocessing as mp
from collections.abc import Callable
from typing import Any
import numpy as np
import numpy.typing as npt
from pathos.multiprocessing import ProcessPool as Pool
from bootstrap_stat._utils import (
ArrayLike,
RNGLike,
_apply_rng,
_percentile,
_spawn_rngs,
)
from bootstrap_stat.distributions import EmpiricalDistribution
from bootstrap_stat.sampling import bootstrap_samples
[docs]
def prediction_error_optimism(
dist: EmpiricalDistribution,
data: ArrayLike,
train: Callable[[ArrayLike], Any],
predict: Callable[[Any, ArrayLike], Any],
error: Callable[[Any, ArrayLike], npt.NDArray[np.float64]],
B: int = 200,
apparent_error: float | None = None,
num_threads: int = 1,
rng: RNGLike = None,
) -> float:
"""Prediction Error, Optimism Method
Parameters
----------
dist : EmpiricalDistribution
Empirical distribution.
data : array_like or pandas DataFrame
The data.
train : function
Function which takes as input a dataset sampled from the
empirical distribution and returns a fitted model.
predict : function
Function which takes as input a fitted model and a dataset, and
returns the predicted labels for that dataset
error : function
Function which takes as input a fitted model and a dataset, and
returns the mean prediction error on that dataset.
B : int, optional
Number of bootstrap samples. Defaults to 200.
apparent_error : float, optional
The prediction error of the model on the dataset used to train
the model, also known as the training error. If omitted, will
be calculated. Can be passed to this function to save time,
for example if the model had already been fit elsewhere.
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
pe : float
Prediction error.
See Also
--------
prediction_error_632 : .632 bootstrap estimate of prediction error.
prediction_interval : Bootstrap prediction interval for a new
observation.
Notes
-----
The bootstrap estimate of prediction error can be used for model
selection. It is similar to cross validation. It adds a bias
correction term to the apparent error (the accuracy of the
predictor applied to the same dataset used to train the
predictor). This bias correction term is called the optimism. See
[ET93]_ (S17) for details.
"""
if apparent_error is None:
mdl = train(data)
pred = predict(mdl, data)
apparent_error = np.mean(error(pred, data))
def stat(x):
mdl_boot = train(x)
pred_orig = predict(mdl_boot, data) # Predictions for original dataset
pred_boot = predict(mdl_boot, x) # Predictions for bootstrap dataset
err_orig = np.mean(error(pred_orig, data)) # Error for original dataset
err_boot = np.mean(error(pred_boot, x)) # Error for bootstrap dataset
optimism = err_orig - err_boot
return optimism
optimism = bootstrap_samples(dist, stat, B, num_threads=num_threads, rng=rng)
pe = apparent_error + np.mean(optimism)
return pe
[docs]
def prediction_error_632(
dist: EmpiricalDistribution,
data: ArrayLike,
train: Callable[[ArrayLike], Any],
predict: Callable[[Any, ArrayLike], Any],
error: Callable[[Any, ArrayLike], npt.NDArray[np.float64]],
B: int = 200,
apparent_error: float | None = None,
use_632_plus: bool = False,
gamma: float | None = None,
no_inf_err_rate: Callable[[Any, ArrayLike], float] | None = None,
num_threads: int = 1,
rng: RNGLike = None,
) -> float:
""".632 Bootstrap
Parameters
----------
dist : EmpiricalDistribution
Empirical distribution.
data : array_like or pandas DataFrame
The data.
train : function
Function which takes as input a dataset sampled from the
empirical distribution and returns a fitted model.
predict : function
Function which takes as input a fitted model and a dataset, and
returns the predicted labels for that dataset
error : function
Function which takes as input a fitted model and a dataset, and
returns the prediction error *for each observation* in that
dataset.
B : int, optional
Number of bootstrap samples. Defaults to 200.
apparent_error : float, optional
The prediction error of the model on the dataset used to train
the model, also known as the training error. If omitted, will
be calculated. Can be passed to this function to save time,
for example if the model had already been fit elsewhere.
use_632_plus : boolean, optional
If True, uses the .632+ bootstrap. See Notes.
gamma : float, optional
The no-information error rate. Required when `use_632_plus` is
True and `no_inf_err_rate` is not provided.
no_inf_err_rate : function, optional
A function that computes the no-information error rate from
predictions and data. Required when `use_632_plus` is True and
`gamma` is not provided. Takes as input the predictions and the
dataset, and returns a float.
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
1, meaning all calculations will be done in a single
thread. Set to -1 to use all available cores.
Returns
-------
pe : float
Prediction error.
See Also
--------
prediction_error_optimism : Optimism-based bootstrap prediction
error.
prediction_interval : Bootstrap prediction interval for a new
observation.
Notes
-----
The .632 bootstrap estimate of prediction error is the weighted
average of the apparent error and another term called eps0. The
latter term is kind of like cross validation: we generate a
bootstrap sample, train a model on it, and then make predictions
using that model on the original dataset. But we only care about
the predictions on observations that are *not* part of the
bootstrap sample. We then average those prediction errors across
all bootstrap samples. See [ET93]_ (S17.7) for details.
The method is so-called because the estimated prediction error is
.368 times the apparent error plus .632 times eps0. [ET93]_ reports
that this method performed better than leave-one-out cross
validation in their simulations, having lower variance, but they
themselves admit they had not thoroughly evaluated it. [Koh95]_
reported that the .632 bootstrap performed quite poorly when
overfitting is present. This led Efron and Tibshirani to propose
the .632+ bootstrap in [ET97]_. Finally, [Arl10]_ surveys various
approaches to model selection, recommending 10-fold cross
validation as the preferred method of model selection.
Because the .632 bootstrap has apparently not withstood the test
of time, I have made no attempt to implement it efficiently,
instead preferring the easy-to-follow approach below. This
function should really only serve pedagogical purposes: it is not
recommended for serious applications!
The no information error rate looks at the prediction at point j,
and computes the error *for every label y_i*. It averages the
errors over all i and j. This is because, if we assume the
features offer no insight into the labels, any observation is as
good as any other at predicting any particular label.
"""
if num_threads == -1:
num_threads = mp.cpu_count()
if apparent_error is None or (use_632_plus and gamma is None):
mdl = train(data)
pred = predict(mdl, data)
if apparent_error is None:
apparent_error = np.mean(error(pred, data))
if use_632_plus and gamma is None:
if no_inf_err_rate is None:
raise ValueError(
"Please specify either the no information error rate,"
"gamma, or a function to calculate it."
)
gamma = no_inf_err_rate(pred, data)
if rng is not None:
_apply_rng(dist, rng)
def _bootstrap_sim(dist, data, train, predict, error, B, worker_rng):
_apply_rng(dist, worker_rng)
n = len(data)
# Total prediction error for each observation, over those
# bootstrap samples that do not include that observation.
Qi = np.zeros((n,))
# Number of times the ith observation appeared in a bootstrap
# sample.
Bi = np.zeros((n,))
for j in range(B):
x_star, s_star = dist.sample(return_indices=True)
mdl_boot = train(x_star)
pred_orig = predict(mdl_boot, data)
q = error(pred_orig, data)
for i in range(n):
if i not in s_star:
Bi[i] += 1
Qi[i] += q[i]
return Bi, Qi
if num_threads == 1:
Bi, Qi = _bootstrap_sim(dist, data, train, predict, error, B, dist._rng)
else:
pool = Pool(num_threads)
try:
pool.restart()
except AssertionError:
pass
results = []
batch_size = B // num_threads
extra = B % num_threads
batch_sizes = [batch_size] * num_threads
for i in range(extra):
batch_sizes[i] += 1
worker_rngs = _spawn_rngs(dist._rng, num_threads)
for i, worker_rng in enumerate(worker_rngs):
r = pool.apipe(
_bootstrap_sim,
dist,
data,
train,
predict,
error,
batch_sizes[i],
worker_rng,
)
results.append(r)
n = len(data)
Bi = np.zeros((n,))
Qi = np.zeros((n,))
for res in results:
Bii, Qii = res.get()
Bi += Bii
Qi += Qii
pool.close()
pool.join()
eps0 = np.mean(Qi / Bi)
err_632 = 0.368 * apparent_error + 0.632 * eps0
if not use_632_plus:
return err_632
else:
R = (eps0 - apparent_error) / (gamma - apparent_error)
if eps0 <= gamma:
eps0p = eps0
else:
eps0p = gamma
if eps0p > apparent_error:
Rp = R
else:
Rp = 0
err_632p = err_632 + (
(eps0p - apparent_error) * (0.368 * 0.632 * Rp) / (1 - 0.368 * Rp)
)
return err_632p
[docs]
def prediction_interval(
dist: EmpiricalDistribution,
x: ArrayLike,
mean: Callable[[ArrayLike], float] | None = None,
std: Callable[[ArrayLike], float] | None = None,
B: int = 1000,
alpha: float = 0.05,
t_star: npt.NDArray[np.float64] | None = None,
return_t_star: bool = False,
num_threads: int = -1,
rng: RNGLike = None,
) -> tuple[float, float] | tuple[float, float, npt.NDArray[np.float64]]:
r"""Prediction interval
Parameters
----------
dist : EmpiricalDistribution
The empirical distribution.
x : array_like or pandas DataFrame
The data.
mean : function, optional
A function returning the mean of a bootstrap sample. Defaults
to np.mean, but this only works for arrays, not DataFrames. To
emphasize, this function must return a float!
std : function, optional
A function returning the standard deviation of a bootstrap
sample. Defaults to np.std using ddof=1. As with `mean`,
specify something different for DataFrames!
B : int, optional
Number of bootstrap samples. Defaults to 1000.
alpha : float, optional
Number controlling the size of the interval. That is, this
function will return a :math:`100(1 - 2\alpha)\%` prediction
interval. Defaults to 0.05.
t_star : array_like or None
Array of studentized values, used to calculate the interval.
Can be passed to this function to speed it up, for example
when calculating multiple intervals.
return_t_star : boolean, optional
If True, return the studentized values. (Sometimes it is
helpful to plot these.)
num_threads : int, optional
Number of threads to use for multicore processing. Defaults to
-1, meaning all available cores will be used. Set to 1 to
disable multicore processing.
Returns
-------
pred_low, pred_high : float
A :math:`100(1 - 2\alpha)\%` prediction interval on a point sampled
from F.
t_star : array
Array of studentized values. Returned only if `return_t_star`
is True.
See Also
--------
prediction_error_optimism : Bootstrap estimate of prediction error,
optimism method.
prediction_error_632 : .632 bootstrap estimate of prediction error.
Notes
-----
Suppose we observe :math:`X_1, X_2, \ldots, X_n` sampled IID from
a distribution :math:`F`. We wish to calculate a range of
plausible values for a new point :math:`X_{n+1}` drawn from the
same distribution. This function returns such a *prediction
interval*.
For each bootstrap replicate, the method draws a sample of size
:math:`n` from :math:`\hat{F}` and a single additional point
:math:`z^*`, then forms the studentized quantity
:math:`t^* = (\bar{X}^* - z^*) / s^*`. The
:math:`\alpha` and :math:`1 - \alpha` quantiles of :math:`t^*`
are inverted to produce the interval. The construction mirrors the
classical t-distribution prediction interval, but makes no
parametric assumption on :math:`F`. See [ET93]_ (S6.5) for
details.
"""
if num_threads == -1:
num_threads = mp.cpu_count()
if mean is None:
def mean(x):
return np.mean(x)
if std is None:
def std(x):
return np.std(x, ddof=1)
x_bar = mean(x)
s = std(x)
if rng is not None:
_apply_rng(dist, rng)
def _bootstrap_sim(dist, mean, std, batch_size, worker_rng):
_apply_rng(dist, worker_rng)
t_star = np.empty((batch_size,))
for i in range(batch_size):
x_star = dist.sample()
z_star = dist.sample(size=1)
s_star = std(x_star)
t_star[i] = (mean(x_star) - mean(z_star)) / s_star
return t_star
if t_star is None:
if num_threads == 1:
t_star = _bootstrap_sim(dist, mean, std, B, dist._rng)
else:
pool = Pool(num_threads)
try:
pool.restart()
except AssertionError:
pass
results = []
batch_size = B // num_threads
extra = B % num_threads
batch_sizes = [batch_size] * num_threads
for i in range(extra):
batch_sizes[i] += 1
worker_rngs = _spawn_rngs(dist._rng, num_threads)
for i, worker_rng in enumerate(worker_rngs):
r = pool.apipe(
_bootstrap_sim, dist, mean, std, batch_sizes[i], worker_rng
)
results.append(r)
t_star = np.hstack([res.get() for res in results])
pool.close()
pool.join()
t_alpha = _percentile(t_star, [alpha, 1 - alpha])
pred_low = x_bar - t_alpha[1] * s
pred_high = x_bar - t_alpha[0] * s
if return_t_star:
return pred_low, pred_high, t_star
else:
return pred_low, pred_high