# pyre-unsafe
"""Dataset loading functions for bootstrap examples."""
import os
from typing import Literal
import pandas as pd
[docs]
def mouse_data(dataset: Literal["control", "treatment"]) -> list[int]:
    """Return survival times for one group of the mouse experiment.

    Mouse data from [ET93]_, Table 2.2: survival times in days of mice
    in a treatment/control experiment.

    Parameters
    ----------
    dataset : {"control", "treatment"}
        Which group to return.

    Returns
    -------
    data : list of int
        Survival times in days. A new list is built on every call.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``"control"`` nor ``"treatment"``.
    """
    # Reject anything that is not one of the two known group names up front.
    if dataset not in ("control", "treatment"):
        raise ValueError("Please specify either 'control' or 'treatment'")

    if dataset == "control":
        return [52, 104, 146, 10, 51, 30, 40, 27, 46]
    return [94, 197, 16, 38, 99, 141, 23]
[docs]
def law_data(full: bool = False) -> pd.DataFrame:
    """Load the law school data.

    Collection of N = 82 American law schools participating in a large
    study of admissions practices. Two measurements were made on the
    entering classes of each school in 1973: LSAT, the average score
    for the class on a national law test, and GPA, the average
    undergraduate grade-point average for the class. Additionally, a
    sample of n = 15 schools was taken.

    Data from Table 3.2 in An Introduction to the Bootstrap by Bradley
    Efron and Robert J. Tibshirani.

    Parameters
    ----------
    full : boolean, optional
        If True, return the full (N = 82) dataset, indexed by
        ``school``. If False (default), return the sampled (n = 15)
        dataset with the ``school`` index dropped and replaced by a
        fresh 0-based index.

    Returns
    -------
    df : pandas DataFrame
        The dataset.
    """
    csv_path = os.path.join(os.path.dirname(__file__), "data", "law_school.csv")
    schools = pd.read_csv(csv_path).set_index("school")
    if full:
        return schools

    # The sampled schools are listed by 1-based number; ``iloc`` is 0-based,
    # so shift each one down by one to get row positions.
    sampled_numbers = (6, 13, 79, 35, 70, 52, 50, 15, 47, 31, 4, 82, 45, 36, 53)
    row_positions = [number - 1 for number in sampled_numbers]
    return schools.iloc[row_positions].reset_index(drop=True)
[docs]
def rainfall_data() -> pd.DataFrame:
    """Load the rainfall data from [ET93]_, Table 4.2.

    Yearly rainfall in inches in Nevada City, California, 1873 through
    1978. An example of time series data.

    Returns
    -------
    df : pandas DataFrame
        DataFrame indexed by ``year`` with column ``rainfall``.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    yearly = pd.read_csv(os.path.join(data_dir, "rainfall.csv"))
    return yearly.set_index("year")
[docs]
def spatial_test_data(
test: Literal["A", "B", "both"] = "both",
) -> pd.DataFrame | pd.Series:
"""Spatial Test Data.
n = 26 children have each taken two tests of spatial ability,
called A and B. Table 14.1 in An Introduction to the Bootstrap by
Bradley Efron and Robert J. Tibshirani.
Parameters
----------
test : ["A", "B", "both"], optional
Which test results to return. Defaults to "both".
Returns
-------
df : pandas DataFrame or Series
The data.
"""
fn = os.path.join(os.path.dirname(__file__), "data", "spatial_test.csv")
df = pd.read_csv(fn)
df.set_index("Child", inplace=True)
if test == "both":
return df
elif test in ["A", "B"]:
return df[test]
else:
raise ValueError("Invalid test")
[docs]
def hormone_data() -> pd.DataFrame:
    """Load the hormone data from [ET93]_, Table 9.1.

    Amount in milligrams of anti-inflammatory hormone remaining in 27
    devices after a certain number of hours of wear. Devices were
    sampled from three manufacturing lots (A, B, C). Lot C appears to
    have greater remaining hormone but was worn the fewest hours; a
    regression analysis clarifies the situation.

    Returns
    -------
    df : pandas DataFrame
        Observations with columns for hours worn, manufacturing lot,
        and remaining hormone level. The frame is returned exactly as
        parsed from the CSV; no index column is set.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    return pd.read_csv(os.path.join(data_dir, "hormone_data.csv"))
[docs]
def patch_data() -> pd.DataFrame:
    """Load the patch data from [ET93]_, Table 10.1.

    Eight subjects wore medical patches designed to increase blood
    levels of a natural hormone. Each subject wore three patches: a
    placebo, an "old" patch from an established plant, and a "new"
    patch from a newly opened plant. The purpose of the experiment was
    to show equivalence between the two plants. Chapter 25 of [ET93]_
    has an extended analysis.

    Two derived columns are computed after loading: ``z`` =
    oldpatch - placebo and ``y`` = newpatch - oldpatch.

    Returns
    -------
    df : pandas DataFrame
        DataFrame indexed by ``subject`` with columns ``placebo``,
        ``oldpatch``, ``newpatch``, ``z``, and ``y``.
    """
    csv_path = os.path.join(os.path.dirname(__file__), "data", "patch.csv")
    measurements = pd.read_csv(csv_path).set_index("subject")
    # Keyword order fixes the column order: ``z`` is added before ``y``.
    return measurements.assign(
        z=measurements["oldpatch"] - measurements["placebo"],
        y=measurements["newpatch"] - measurements["oldpatch"],
    )