# Source code for bootstrap_stat.datasets

# pyre-unsafe
"""Dataset loading functions for bootstrap examples."""

import os
from typing import Literal

import pandas as pd



[docs]
def mouse_data(dataset: Literal["control", "treatment"]) -> list[int]:
    """Return one arm of the mouse survival data ([ET93]_, Table 2.2).

    The values are survival times, in days, of mice in a
    treatment/control experiment.

    Parameters
    ----------
    dataset : {"control", "treatment"}
        Name of the group whose survival times are wanted.

    Returns
    -------
    data : list of int
        Survival times in days for the requested group. A new list is
        built on every call.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``"control"`` nor ``"treatment"``.

    """
    if dataset == "control":
        return [52, 104, 146, 10, 51, 30, 40, 27, 46]

    if dataset == "treatment":
        return [94, 197, 16, 38, 99, 141, 23]

    raise ValueError("Please specify either 'control' or 'treatment'")




[docs]
def law_data(full: bool = False) -> pd.DataFrame:
    """Load the law school data (Efron & Tibshirani, Table 3.2).

    The population is N = 82 American law schools that took part in a
    large study of admissions practices. For the class entering each
    school in 1973 two measurements were recorded: LSAT, the class
    average score on a national law test, and GPA, the class average
    undergraduate grade-point average.

    A sample of n = 15 of those schools is also available and is what
    this function returns by default.

    Source: Table 3.2 in An Introduction to the Bootstrap by Bradley
    Efron and Robert J. Tibshirani.

    Parameters
    ----------
    full : boolean, optional
        When True, return all N = 82 schools. When False (default),
        return only the n = 15 sampled schools.

    Returns
    -------
    df : pandas DataFrame
        The full dataset indexed by ``school``, or the sampled rows
        with a fresh 0-based integer index.

    """
    csv_path = os.path.join(os.path.dirname(__file__), "data", "law_school.csv")
    schools = pd.read_csv(csv_path).set_index("school")

    if full:
        return schools

    # The sampled schools are listed by their 1-based row number in the
    # table; shift them to 0-based positions for ``iloc``.
    sampled_rows = (6, 13, 79, 35, 70, 52, 50, 15, 47, 31, 4, 82, 45, 36, 53)
    positions = [row - 1 for row in sampled_rows]

    # The ``school`` index is discarded for the sample.
    return schools.iloc[positions].reset_index(drop=True)




[docs]
def rainfall_data() -> pd.DataFrame:
    """Load the rainfall data from [ET93]_, Table 4.2.

    Yearly rainfall, in inches, in Nevada City, California, for 1873
    through 1978. Serves as an example of time series data.

    Returns
    -------
    df : pandas DataFrame
        DataFrame indexed by ``year`` with column ``rainfall``.

    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, "data", "rainfall.csv")
    return pd.read_csv(csv_path).set_index("year")




[docs]
def spatial_test_data(
    test: Literal["A", "B", "both"] = "both",
) -> pd.DataFrame | pd.Series:
    """Spatial Test Data.

    n = 26 children have each taken two tests of spatial ability,
    called A and B. Table 14.1 in An Introduction to the Bootstrap by
    Bradley Efron and Robert J. Tibshirani.

    Parameters
    ----------
     test : ["A", "B", "both"], optional
        Which test results to return. Defaults to "both".

    Returns
    -------
     df : pandas DataFrame or Series
        The data.

    """
    fn = os.path.join(os.path.dirname(__file__), "data", "spatial_test.csv")
    df = pd.read_csv(fn)
    df.set_index("Child", inplace=True)

    if test == "both":
        return df
    elif test in ["A", "B"]:
        return df[test]
    else:
        raise ValueError("Invalid test")




[docs]
def hormone_data() -> pd.DataFrame:
    """Load the hormone data from [ET93]_, Table 9.1.

    Milligrams of anti-inflammatory hormone remaining in 27 devices
    after a certain number of hours of wear. The devices came from
    three manufacturing lots (A, B, C). Lot C appears to retain more
    hormone but was also worn the fewest hours; a regression analysis
    clarifies the situation.

    Returns
    -------
    df : pandas DataFrame
        Observations with columns for hours worn, manufacturing lot,
        and remaining hormone level, exactly as stored in the CSV
        (default integer index).

    """
    here = os.path.dirname(__file__)
    return pd.read_csv(os.path.join(here, "data", "hormone_data.csv"))




[docs]
def patch_data() -> pd.DataFrame:
    """Load the patch data from [ET93]_, Table 10.1.

    Eight subjects wore medical patches designed to raise blood levels
    of a natural hormone. Every subject wore three patches: a placebo,
    an "old" patch from an established plant, and a "new" patch from a
    newly opened plant. The experiment aimed to show equivalence
    between the two plants. Chapter 25 of [ET93]_ has an extended
    analysis.

    Two derived columns are appended to the measurements read from the
    CSV: ``z`` = oldpatch - placebo and ``y`` = newpatch - oldpatch.

    Returns
    -------
    df : pandas DataFrame
        DataFrame indexed by ``subject`` with columns ``placebo``,
        ``oldpatch``, ``newpatch``, ``z``, and ``y``.

    """
    csv_path = os.path.join(os.path.dirname(__file__), "data", "patch.csv")
    measurements = pd.read_csv(csv_path).set_index("subject")

    old_minus_placebo = measurements["oldpatch"] - measurements["placebo"]
    new_minus_old = measurements["newpatch"] - measurements["oldpatch"]

    # Keyword order fixes the column order: ``z`` first, then ``y``.
    return measurements.assign(z=old_minus_placebo, y=new_minus_old)