From 085a7955d33df20dce486ac540cf934b9648a8e0 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 9 May 2023 15:34:59 +1000
Subject: [PATCH 01/20] Update docs of public API in `model_evaluation.py`
---
cinspect/model_evaluation.py | 112 ++++++++++++++++++++++++++++++-----
1 file changed, 98 insertions(+), 14 deletions(-)
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index a5e4c3a..96ef20e 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -22,6 +22,10 @@
LOG = logging.getLogger(__name__)
+# TODO consolidate return types: there is currently redundancy; we could
+# either return the evaluations (and formalise the index-wise input-output
+# correspondence), or the evaluated evaluators, but probably not both.
+
def crossval_model(
estimator: BaseEstimator,
@@ -30,21 +34,46 @@ def crossval_model(
evaluators: Sequence[Evaluator],
cv: Optional[
Union[int, BaseCrossValidator]
- ] = None, # defaults to KFold(n_splits=5)
+ ] = 5, # defaults to KFold(n_splits=5)
random_state: Optional[Union[int, np.random.RandomState]] = None,
stratify: Optional[Union[np.ndarray, pd.Series]] = None,
- n_jobs=1,
-) -> Sequence[Evaluator]:
+ n_jobs: Optional[int] = 1,
+) -> Sequence[Tuple[Evaluator, Any]]:
"""
Evaluate a model using cross validation.
A list of evaluators determines what other metrics, such as feature
- importance and partial dependence are computed
+ importance and partial dependence are computed.
+
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ A scikit-learn estimator.
+ X : pd.DataFrame
+ The features.
+ y : Union[pd.Series, pd.DataFrame]
+ The target.
+ evaluators : Sequence[Evaluator]
+ A list of evaluators.
+ cv : Union[int, BaseCrossValidator], optional
+ The cross validation strategy, by default KFold(n_splits=5)
+ random_state : Union[int, np.random.RandomState], optional
+ The random state, by default None
+ stratify : Union[np.ndarray, pd.Series], optional
+ The stratification variable, by default None
+ n_jobs : int, optional
+ The number of jobs to run in parallel, by default 1
+
+ Returns
+ -------
+ Sequence[Tuple[Evaluator, Any]]
+ A sequence of evaluated Evaluators (corresponding to the input evaluators)
+ and their evaluations.
"""
+
# Run various checks and prepare the evaluators
random_state = check_random_state(random_state)
- cv = 5 if cv is None else cv
if isinstance(cv, int):
cv = KFold(n_splits=cv, shuffle=True, random_state=random_state)
@@ -77,23 +106,49 @@ def bootstrap_model(
random_state: Optional[Union[int, np.random.RandomState]] = None,
use_group_cv: bool = False,
n_jobs=1,
-) -> Sequence[Evaluator]:
+) -> Sequence[Tuple[Evaluator, Any]]:
"""
Retrain a model using bootstrap re-sampling.
A list of evaluators determines what statistics are computed with the
bootstrap samples.
- The same sample are passed into `fit` and `evaluate`.
+ The same samples are passed into `fit` and `evaluate`.
+
+ Stratification is supported as in `sklearn.utils.resample`.
+
Parameters
----------
- use_group_cv: bool
- This inputs the indices of the re-sampled datasets into the estimator
+ estimator : BaseEstimator
+ A scikit-learn estimator.
+ X : pd.DataFrame
+ The features.
+ y : Union[pd.DataFrame, pd.Series]
+ The target.
+ evaluators : Sequence[Evaluator]
+ A list of evaluators.
+ replications : int, optional
+ The number of bootstrap replications, by default 100
+ subsample : float, optional
+ Approximate proportion of the data to sample, by default 1.0
+ stratify : Optional[Union[pd.Series, np.ndarray]], optional
+ The stratification variable, by default None
+ random_state : Optional[Union[int, np.random.RandomState]], optional
+ The random state, by default None
+ use_group_cv : bool, optional
+ If True, the indices of the re-sampled datasets are passed to the estimator
as `estimator.fit(X_resample, y_resample, groups=indices_resample)`.
This can only be used with e.g. `GridSearchCV` where `cv` is
`GroupKFold`. This stops the same sample appearing in both the test and
- training splits of any inner cross validation.
+ training splits of any inner cross validation. By default False
+ n_jobs : int, optional
+ The number of jobs to run in parallel, by default 1
+
+ Returns
+ -------
+ Sequence[Tuple[Evaluator, Any]]
+ A sequence of evaluated Evaluators (corresponding to the input evaluators)
+ and their evaluations.
"""
# Run various checks and prepare the evaluators
n = len(X)
@@ -141,17 +196,46 @@ def bootcross_model(
"""
Use bootstrapping to compute random train/test folds (no sample sharing).
- A list of evaluators determines what statistics are computed with the
+ The input evaluators determine what statistics are computed with the
crossed bootstrap samples.
Parameters
----------
- use_group_cv: bool
- This inputs the indices of the re-sampled datasets into the estimator
+ estimator : BaseEstimator
+ A scikit-learn estimator.
+ X : pd.DataFrame
+ The features.
+ y : Union[pd.DataFrame, pd.Series]
+ The target.
+ evaluators : Sequence[Evaluator]
+ A list of evaluators.
+ replications : int, optional
+ The number of "bootcross" replications, by default 100
+ test_size : Union[int, float], optional
+ The approximate proportion (float in (0, 1)) or count (int in [1, n])
+ of the data to be used for the test set, by default 0.25
+ random_state : Optional[Union[int, np.random.RandomState]], optional
+ The random state, by default None
+ use_group_cv : bool, optional
+ If True, the indices of the re-sampled datasets are passed to the estimator
as `estimator.fit(X_resample, y_resample, groups=indices_resample)`.
This can only be used with e.g. `GridSearchCV` where `cv` is
`GroupKFold`. This stops the same sample appearing in both the test and
- training splits of any inner cross validation.
+ training splits of any inner cross validation. By default False
+ n_jobs : int, optional
+ The number of jobs to run in parallel, by default 1
+
+ Returns
+ -------
+ Sequence[Tuple[Evaluator, Any]]
+ A sequence of evaluated Evaluators (corresponding to the input evaluators),
+ and their evaluations.
+
+ Raises
+ ------
+ ValueError
+ If `test_size` is not a float in (0, 1) or an int in [1, n].
"""
random_state = check_random_state(random_state)
n = len(X)
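To make the documented `crossval_model` contract concrete, here is a minimal usage sketch (not part of the patch): the toy data and `LinearRegression` estimator are placeholders, and the `ScoreEvaluator(scorers=["r2"])` constructor call is an assumption about that class's signature.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from cinspect.evaluators import ScoreEvaluator
from cinspect.model_evaluation import crossval_model

rng = np.random.RandomState(42)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])
y = pd.Series(0.5 * X["a"] + rng.normal(scale=0.1, size=100))

# returns Sequence[Tuple[Evaluator, Any]] per the docstring above
results = crossval_model(LinearRegression(), X, y, [ScoreEvaluator(scorers=["r2"])], cv=5)
for evaluator, evaluation in results:
    print(type(evaluator).__name__, evaluation)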
From b78f2e9e03a84b309dd23d28a3614aab44d374f8 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 9 May 2023 16:07:26 +1000
Subject: [PATCH 02/20] Update model evaluation docs: focus on bootcross
---
cinspect/model_evaluation.py | 57 +++++++++++++++++++++++++++++++-----
1 file changed, 50 insertions(+), 7 deletions(-)
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index 96ef20e..ecef29a 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -45,7 +45,6 @@ def crossval_model(
A list of evaluators determines what other metrics, such as feature
importance and partial dependence are computed.
-
Parameters
----------
estimator : BaseEstimator
@@ -70,7 +69,6 @@ def crossval_model(
A sequence of evaluated Evaluators (corresponding to the input evaluators)
and their evaluations.
"""
-
# Run various checks and prepare the evaluators
random_state = check_random_state(random_state)
@@ -193,8 +191,11 @@ def bootcross_model(
use_group_cv: bool = False,
n_jobs=1,
) -> Sequence[Tuple[Evaluator, Any]]:
- """
- Use bootstrapping to compute random train/test folds (no sample sharing).
+ """Use bootstrapping to compute random train/test folds (no sample sharing).
+
+ "Bootcross": split into train/test sets
+ then separately resample these sets (once) with replacement.
+
 The input evaluators determine what statistics are computed with the
crossed bootstrap samples.
@@ -267,9 +268,25 @@ def bootcross_model(
return evaluators_evaluations
-def _check_group_estimator(estimator, use_group_cv):
+def _check_group_estimator(estimator: BaseEstimator, use_group_cv: bool) -> bool:
+ """Perform checks on the estimator and use_group_cv parameter.
+
+ If use_group_cv is True, warn the user if the estimator doesn't support groups.
+ If use_group_cv is False, warn the user if the estimator is a parameter search estimator.
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ A scikit-learn estimator.
+ use_group_cv : bool
+ Whether or not to use groups in the cross validation procedure.
+
+ Returns
+ -------
+ bool
+ Simply passes through `use_group_cv`.
+ """
if use_group_cv:
- # Check if estimator supports group keyword
+ # Check if estimator supports groups keyword
spec = inspect.signature(estimator.fit)
if "groups" not in spec.parameters:
LOG.warning(
@@ -288,10 +305,36 @@ def _check_group_estimator(estimator, use_group_cv):
return use_group_cv
-def _bootcross_split(data_size, test_size, random_state):
+def _bootcross_split(
+    data_size: int,
+    test_size: int,
+    random_state: np.random.RandomState,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """Split indices for "bootcross".
+
+ Bootcross: split into train/test, then resample these sets (once) with replacement.
+
+ Parameters
+ ----------
+ data_size : int
+ number of samples to split
+ test_size : int
+ number of samples to use for test set
+ random_state : np.random.RandomState
+ random state
+
+ Returns
+ -------
+ Tuple[np.ndarray, np.ndarray]
+ train indices, test indices
+ """
+ assert test_size > 0 and test_size <= data_size
+
+ # random permutation of range(data_size)
permind = random_state.permutation(data_size)
+ # split into test and train indices
test_ind = permind[:test_size]
train_ind = permind[test_size:]
+ # resample these train/test indices with replacement
test_boot = resample(test_ind, random_state=random_state)
train_boot = resample(train_ind, random_state=random_state)
return train_boot, test_boot
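A standalone sketch of the "bootcross" split this patch documents, showing that the resampled train and test draws can never share a sample (the two index pools are disjoint before resampling):

import numpy as np
from sklearn.utils import resample

def bootcross_split(data_size, test_size, random_state):
    permind = random_state.permutation(data_size)  # shuffle all indices
    test_ind, train_ind = permind[:test_size], permind[test_size:]
    # resample each side independently, with replacement
    test_boot = resample(test_ind, random_state=random_state)
    train_boot = resample(train_ind, random_state=random_state)
    return train_boot, test_boot

train, test = bootcross_split(100, 25, np.random.RandomState(0))
assert set(train).isdisjoint(test)  # no sample sharing across the fold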
From dc2f66c23380d1747a578cf107f739ced3a27ac8 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 9 May 2023 17:03:29 +1000
Subject: [PATCH 03/20] Update model_evaluation.py: general train/test docs
---
cinspect/model_evaluation.py | 122 +++++++++++++++++++++++++++++++++--
1 file changed, 117 insertions(+), 5 deletions(-)
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index ecef29a..c88dde3 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -354,6 +354,37 @@ def _repeatedly_evaluate_model(
n_jobs=1,
name_for_logging: str = "Evaluation",
) -> Sequence[Tuple[Evaluator, Any]]:
+ """
+ Repeatedly evaluate a model on different train/test splits.
+
+ Optionally parallelises over the different train/test splits.
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ Estimator to evaluate
+ X : pd.DataFrame
+ Features
+ y : Union[pd.DataFrame, pd.Series]
+ Target
+ train_test_indices_generator : Sequence[Tuple[npt.ArrayLike, npt.ArrayLike]]
+ Sequence of train/test index arrays (can be lazy, hence 'generator')
+ evaluators : Sequence[Evaluator]
+ Evaluators to use
+ use_group_cv : bool, optional
+ Whether to use group cross validation, by default False
+ random_state : Optional[Union[int, np.random.RandomState]], optional
+ Random state, by default None
+ n_jobs : int, optional
+ Number of jobs to run in parallel via `joblib.Parallel`, by default 1
+ name_for_logging : str, optional
+ Name to use for this procedure in logging, by default "Evaluation"
+
+ Returns
+ -------
+ Sequence[Tuple[Evaluator, Any]]
+ Input evaluators and their corresponding aggregated evaluations.
+ """
 # Runs code that requires the full set of data to be available, for example
# to select the range over which partial dependence should be shown.
@@ -408,14 +439,53 @@ def eval_iter_f(train_test_indices_tuple):
return list(zip(evaluators, evaluations_combined))
-def _set_evaluators_evaluations(evaluators_and_their_evaluations):
+def _set_evaluators_evaluations(
+    evaluators_and_their_evaluations: Sequence[Tuple[Evaluator, Any]]
+) -> None:
+ """
+ Set the evaluations on the evaluators. Mutates the input evaluators.
+
+ Parameters
+ ----------
+ evaluators_and_their_evaluations : Sequence[Tuple[Evaluator, Any]]
+ Evaluators and their corresponding evaluations.
+ """
for tor, tion in evaluators_and_their_evaluations:
tor.set_evaluation(tion)
def _train_evaluate_model(
- estimator, X, y, train_indices, test_indices, evaluator, use_group_cv=False
-):
+ estimator : BaseEstimator,
+ X : pd.DataFrame,
+ y : Union[pd.DataFrame, pd.Series],
+ train_indices : Sequence[int],
+ test_indices : Sequence[int],
+ evaluator : Evaluator,
+ use_group_cv : bool = False,
+) -> Evaluator.Evaluation:
+ """
+ Train and evaluate a model on a given train/test split.
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ Estimator to evaluate
+ X : pd.DataFrame
+ Features
+ y : Union[pd.DataFrame, pd.Series]
+ Target
+ train_indices : Sequence[int]
+ Indices to use for training
+ test_indices : Sequence[int]
+ Indices to use for testing
+ evaluator : Evaluator
+ Evaluator to use
+ use_group_cv : bool, optional
+ Whether to use group cross validation, by default False
+
+ Returns
+ -------
+ Evaluator.Evaluation
+ Evaluation of the estimator on the given train/test split.
+ """
est = _train_model(
estimator=estimator,
X=X,
@@ -439,7 +509,28 @@ def _train_model(
y: Union[pd.DataFrame, pd.Series],
train_indices: Sequence[int],
use_group_cv: bool = False,
-):
+) -> BaseEstimator:
+ """
+ Train a model on a subset of the data. Mutates the input estimator.
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ Estimator to train
+ X : pd.DataFrame
+ Features
+ y : Union[pd.DataFrame, pd.Series]
+ Target
+ train_indices : Sequence[int]
+ Indices of the training data
+ use_group_cv : bool, optional
+ Whether to use group cross validation, by default False
+
+ Returns
+ -------
+ BaseEstimator
+ Trained estimator
+ """
group = _check_group_estimator(estimator, use_group_cv)
X_train, y_train = get_rows(X, train_indices), get_rows(y, train_indices)
if group:
@@ -455,7 +546,27 @@ def _evaluate_model(
y: Union[pd.DataFrame, pd.Series],
evaluator: Evaluator,
test_indices: Sequence[int],
-):
+) -> Evaluator.Evaluation:
+ """Evaluate a pre-trained estimator on a subset of the data.
+
+ Parameters
+ ----------
+ estimator : BaseEstimator
+ Estimator to evaluate
+ X : pd.DataFrame
+ Features
+ y : Union[pd.DataFrame, pd.Series]
+ Target
+ evaluator : Evaluator
+ Evaluator to use
+ test_indices : Sequence[int]
+ Indices of the test data
+
+ Returns
+ -------
+ Evaluator.Evaluation
+ Evaluation of the trained estimator
+ """
evaluation = evaluator.evaluate(
estimator, get_rows(X, test_indices), get_rows(y, test_indices)
)
@@ -467,5 +578,6 @@ def _evaluate_model(
def _split_data(data, train_ind, test_ind):
+ """Split data into train and test sets, independently of the type of `data`."""
data_r, data_s = get_rows(data, train_ind), get_rows(data, test_ind)
return data_r, data_s
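For orientation, the control flow these helper docstrings describe boils down to something like the following sketch (simplified: no parallelism, no group handling, and the evaluators' `prepare` step is omitted; assumes pandas inputs):

from sklearn.base import clone

def repeatedly_evaluate(estimator, X, y, splits, evaluators):
    per_split = []
    for train_idx, test_idx in splits:
        # fit a fresh clone on the train split, evaluate on the test split
        est = clone(estimator).fit(X.iloc[train_idx], y.iloc[train_idx])
        per_split.append(
            [ev.evaluate(est, X.iloc[test_idx], y.iloc[test_idx]) for ev in evaluators]
        )
    # one list of evaluations per evaluator, aggregated into a single evaluation
    return [
        (ev, ev.aggregate([row[i] for row in per_split]))
        for i, ev in enumerate(evaluators)
    ]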
From 4270f2ba12a17e1a9cb18aa47ead51bfc6b53882 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 10 May 2023 11:22:00 +1000
Subject: [PATCH 04/20] Remove unnecessary raw docstrings in `stats.py`
---
cinspect/stats.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/cinspect/stats.py b/cinspect/stats.py
index 91c5542..d1d7a1c 100644
--- a/cinspect/stats.py
+++ b/cinspect/stats.py
@@ -8,7 +8,7 @@
def conditional_cov(X, Y, estimator=None, bias=False, ddof=None):
- r"""Compute the conditional covariance, COV(Y|X).
+ """Compute the conditional covariance, COV(Y|X).
This computes:
@@ -64,7 +64,7 @@ def conditional_cov(X, Y, estimator=None, bias=False, ddof=None):
def conditional_corrcoef(X, Y, estimator=None):
- r"""Compute the conditional correlation, CORR(Y|X).
+ """Compute the conditional correlation, CORR(Y|X).
This is the normalised covariance,
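For readers of `conditional_cov`'s docstring, one standard construction of COV(Y|X) is the covariance of the residuals of Y after regressing on X; the library version additionally accepts `estimator`, `bias` and `ddof` arguments per the signature above, so this is only an illustrative sketch:

import numpy as np
from sklearn.linear_model import LinearRegression

def conditional_cov_sketch(X, Y):
    # residuals of Y after regressing out X (linear conditioning assumed)
    residuals = Y - LinearRegression().fit(X, Y).predict(X)
    return np.cov(residuals, rowvar=False)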
From 4051c85bfe2c398fe4cd46296090d649dde7990d Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 10 May 2023 12:14:29 +1000
Subject: [PATCH 05/20] Correct docs of `evaluators.py`
---
cinspect/evaluators.py | 88 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 79 insertions(+), 9 deletions(-)
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index eb91ba4..79fc75d 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -19,7 +19,7 @@
from sklearn.metrics import get_scorer
from cinspect import dependence, importance
-from cinspect.utils import get_column
+from cinspect.utils import Bunch, get_column
LOG = logging.getLogger(__name__)
@@ -171,7 +171,12 @@ def get_results(
Returns
-------
Any
- _description_
+ The stored Evaluation (subclasses may override this)
+
+ Raises
+ ------
+ Exception
+ If no evaluation is available
"""
if evaluation is None and hasattr(self, "evaluation"):
evaluation = self.evaluation
@@ -296,8 +301,6 @@ class BinaryTreatmentEffect(Evaluator):
 Value of treatment variable when "treated", by default 1
control_val: Any, optional
Value of treatment variable when "untreated", by default 0
- evaluate_mode: str, optional
- Evaluation mode; "train"/"test"/"all", by default "all"
"""
# type of the Evaluation produced
@@ -675,6 +678,8 @@ def __init__(self, X, feature_name, grid_values="auto"):
class PermutationImportanceEvaluator(Evaluator):
"""Permutation Importance Evaluator.
+ TODO Evaluation could/should be a bunch of lists, rather than a list of bunches
+
Parameters
----------
n_repeats: int
@@ -734,10 +739,31 @@ def __init__(
self.scorer = scorer
self.conditional_filter = conditional_filter
- def prepare(self, estimator, X, y=None, random_state=None):
- """Prepare the evaluator with model and data information.
+ def prepare(self,
+ estimator: Estimator,
+ X : npt.ArrayLike,
+ y : Optional[npt.ArrayLike]=None ,
+ random_state : RandomStateType=None):
+ """
+ Prepare the evaluator with model and data information. Mutates the Evaluator's state.
This is called by a model evaluation function in model_evaluation.
+
+ Parameters
+ ----------
+ estimator : Estimator
+ The estimator that will be evaluated
+ X : npt.ArrayLike
+ Training feature data
+ y : Optional[npt.ArrayLike], optional
+ Training target data, by default None
+ random_state : RandomStateType, optional
+ Random state, by default None
+
+ Raises
+ ------
+ ValueError
+ If column grouping is requested but grouping types are heterogeneous
"""
if self.end_transform_indx is not None:
transformer = clone(estimator[0 : self.end_transform_indx])
@@ -774,10 +800,28 @@ def prepare(self, estimator, X, y=None, random_state=None):
self.n_original_columns = X.shape[1]
self.random_state = random_state
- def evaluate(self, estimator, X, y) -> List[np.ndarray]:
+ def evaluate(self,
+ estimator : Estimator,
+ X: npt.ArrayLike,
+ y: npt.ArrayLike) -> List[Bunch]:
"""Evaluate the fitted estimator with input data.
This is called by a model evaluation function in model_evaluation.
+
+ Parameters
+ ----------
+ estimator: Estimator
+ The fitted estimator to evaluate
+ X: npt.ArrayLike
+ Feature data
+ y: npt.ArrayLike
+ Target data
+
+ Returns
+ -------
+ List[Bunch]
+ A singleton list containing a Bunch that holds the permutation
+ importance for each feature.
"""
if self.end_transform_indx is not None:
transformer = estimator[0 : self.end_transform_indx]
@@ -823,8 +867,19 @@ def evaluate(self, estimator, X, y) -> List[np.ndarray]:
return [imprt.importances]
- def aggregate(self, evaluations: Sequence[List[np.ndarray]]) -> List[np.ndarray]:
- """Concatenate sequence of evaluations."""
+ def aggregate(self, evaluations: Sequence[List[Bunch]]) -> List[Bunch]:
+ """Concatenate sequence of evaluations.
+
+ Parameters
+ ----------
+ evaluations: Sequence[List[`~sklearn.utils.Bunch`]]
+ Sequence of evaluations to concatenate
+
+ Returns
+ -------
+ List[`~sklearn.utils.Bunch`]
+ Concatenated evaluations
+ """
return _flatten(evaluations)
def get_results(self, evaluation=None, ntop=10, name=None) -> mpl.figure.Figure:
@@ -860,6 +915,21 @@ def _get_column_indices_and_names(X, columns=None):
----------
X: numpy array or pd.DataFrame
columns: iterable of strings or ints
+
+
+ Returns
+ -------
+ indices: list of ints
+ names: list of strings
+ passed_by_name: bool
+ True if columns were specified by name, False if by index
+
+ Raises
+ ------
+ KeyError
+ If a specified column name is not in the data
+ ValueError
+ If columns are specified by name but the data does not have column names
"""
# columns not specified - return all
if columns is None:
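The `aggregate` docstring above says evaluations are concatenated; since each `evaluate` call returns a singleton `List[Bunch]`, aggregation is plain list flattening. A sketch of the behaviour assumed for the private `_flatten` helper:

from sklearn.utils import Bunch

def flatten(evaluations):
    # concatenate per-replication singleton lists into one flat list
    return [bunch for evaluation in evaluations for bunch in evaluation]

rep1 = [Bunch(importances=[0.1, 0.3])]  # one replication's evaluation
rep2 = [Bunch(importances=[0.2, 0.4])]
assert [b.importances for b in flatten([rep1, rep2])] == [[0.1, 0.3], [0.2, 0.4]]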
From 8bd12df7c57d7d1826b18b4435fd4b50ab29ee4e Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 10 May 2023 13:02:15 +1000
Subject: [PATCH 06/20] Update `dependence.py` docs
---
cinspect/dependence.py | 153 ++++++++++++++++++++++++++++-------------
1 file changed, 104 insertions(+), 49 deletions(-)
diff --git a/cinspect/dependence.py b/cinspect/dependence.py
index b17acde..277064a 100644
--- a/cinspect/dependence.py
+++ b/cinspect/dependence.py
@@ -3,10 +3,12 @@
"""Partial dependence and individual conditional expectation functions."""
import numbers
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Type, Union
+import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
+import numpy.typing as npt
import pandas as pd
from scipy.stats.mstats import mquantiles
@@ -14,9 +16,11 @@
IMAGE_TYPE = "png"
-def numpy2d_to_dataframe_with_types(X, columns, types):
+def numpy2d_to_dataframe_with_types(X : np.ndarray,
+ columns : List[str],
+ types : List[Type]) -> pd.DataFrame:
"""
- Create a new dataframe with the specified column names and column types.
+ Create a new dataframe with the specified column names and column types.
Example
X = pd.DataFrame(...)
@@ -26,6 +30,24 @@ def numpy2d_to_dataframe_with_types(X, columns, types):
Xnew = numpy2d_to_dataframe_with_types(values,df_columns,df_types)
+ Parameters
+ ----------
+ X : np.ndarray
+ Data to convert to dataframe
+ columns : List[str]
+ Column names
+ types : List[Type]
+ Column types
+
+ Returns
+ -------
+ pd.DataFrame
+ Dataframe with specified data, column names and types
+
+ Raises
+ ------
+ AssertionError
+ If the number of columns, column names and column types do not match
"""
nxcols, ncols, ntypes = X.shape[1], len(columns), len(types)
assert nxcols == ncols == ntypes, (
@@ -71,6 +93,13 @@ def individual_conditional_expectation(
predictions: 2d ndarray
the model predictions, where the specified feature is set to the
corresponding value in grid_values
+
+ Raises
+ ------
+ ValueError
+ If predict_method is None and the model does not have a predict/predict_proba method
+ ValueError
+ If feature is not an integer and X is not a pandas DataFrame
"""
if predict_method is None:
if hasattr(model, "predict_proba"):
@@ -85,7 +114,6 @@ def predict_method(X):
"model does not support `predict_proba` or `predict` and no "
"alternate method specified."
)
-
input_df = False # track if the predictor is expecting a dataframe
if hasattr(X, "columns"): # pandas DataFrame
df_columns = X.columns
@@ -192,29 +220,33 @@ def construct_grid(
def plot_partial_dependence_density(
- ax, grid, density, feature_name, categorical, color="black", alpha=0.5
+ ax : plt.Axes,
+ grid : Sequence[float],
+ density : npt.ArrayLike,
+ feature_name : str,
+ categorical : bool,
+ color : str = "black",
+ alpha : float = 0.5,
) -> Tuple[np.ndarray, Union[np.ndarray, List[np.ndarray]]]:
"""
Plot partial dependency on axes ax.
- TODO next
-
Parameters
----------
- ax : _type_
- _description_
- grid : _type_
- _description_
- density : _type_
- _description_
- feature_name : _type_
- _description_
- categorical : _type_
- _description_
+ ax : plt.Axes
+ Axes to plot on
+ grid : Sequence[float]
+ The grid values
+ density : npt.ArrayLike
+ The density values
+ feature_name : str
+ The name of the feature
+ categorical : bool
+ Whether the feature is categorical
color : str, optional
- _description_, by default "black"
+ The color of the plot bins, by default "black"
alpha : float, optional
- _description_, by default 0.5
+ The opacity of the plot bins, by default 0.5
Returns
-------
@@ -232,7 +264,7 @@ def plot_partial_dependence_density(
even if no weighting or normalization is used.
"""
- # plot the distribution for of the variable on the second axis
+ # plot the distribution of the variable on the second axis
if categorical:
x = np.arange(len(grid))
ax.bar(x, density, color=color, alpha=alpha)
@@ -260,43 +292,60 @@ def plot_partial_dependence_density(
def plot_partial_dependence_with_uncertainty(
- grid,
- predictions,
- feature_name,
- categorical=True,
- density=None,
- name=None,
- mode="multiple-pd-lines",
- ax=None,
- color="black",
- color_samples="grey",
- alpha=0.3,
- label=None,
- ci_bounds=(0.025, 0.975),
-):
- """Plot partial dependence plot with uncertainty estimates.
-
- TODO: proper docstring.
+ grid : npt.ArrayLike,
+ predictions : List[np.ndarray],
+ feature_name : str,
+ categorical : bool = True,
+ density : npt.ArrayLike = None,
+ name : str = None,
+ mode : str = "multiple-pd-lines",
+ ax : plt.Axes = None,
+ color : str = "black",
+ color_samples : str = "grey",
+ alpha : float = 0.5,
+ label : str = None,
+ ci_bounds : Tuple[float, float] = (0.025, 0.975)
+) -> Tuple[mpl.Figure, dict]:
+ """
+ Plot partial dependence plot with uncertainty estimates.
Parameters
----------
- grid: np.array
+ grid : npt.ArrayLike
Array of values of the feature for which the pdp values have been
computed
- predictions list[np.array]
- List of ICE predictions, one from each fold. Each array is shaped
- (num_samples, size_of_grid)
- feature_name: str
- The name of the feature
- alpha: float
- The alpha of the confidence region or multiple PD lines.
- mode: str
- One of -
+ predictions : List[np.ndarray]
+ List of ICE predictions, one from each fold.
+ Each array is shaped (num_samples, size_of_grid)
+ feature_name : str
+ The feature we are plotting the partial dependency on
+ categorical : bool, optional
+ Whether the feature is categorical, by default True
+ density : npt.ArrayLike, optional
+ The density values, by default None
+ name : str, optional
+ The dependent variable's name (used only for labels), by default None
+ mode : str, optional
+ One of:
multiple-pd-lines - a PD line for each sample of data
derivative - a derivative PD plot with mean and confidence
intervals.
- interval - a PD plot with
- ice-mu-sd
+ interval - a PD plot with confidence intervals
+ ice-mu-sd - a PD plot with ICE mean and standard deviation
+ By default "multiple-pd-lines"
+ ax : plt.Axes, optional
+ Axes to plot on, by default None.
+ Should not be passed if density is provided.
+ color : str, optional
+ Colour of the PD plot, by default "black"
+ color_samples : str, optional
+ Secondary colour, by default "grey"
+ alpha : float, optional
+ The alpha of the confidence region or multiple PD lines, by default 0.5
+ label : str, optional
+ The label for the PD plot, by default None
+ ci_bounds : Tuple[float, float], optional
+ The lower and upper bounds of the confidence interval, by default (0.025, 0.975)
Returns
-------
@@ -308,6 +357,12 @@ def plot_partial_dependence_with_uncertainty(
derivative - "mean" and "lower" and "upper" confidence intervals
interval - "mean" and "lower" and "upper" confidence intervals
ice-mu-sd - "mean" and the "std" of the ICE plots
+
+ Raises
+ ------
+ ValueError
+ If ax is provided and density is not None
+ ValueError
+ If mode is not one of "multiple-pd-lines", "derivative", "interval", "ice-mu-sd".
"""
res = {}
fig = None
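A runnable version of the elided `numpy2d_to_dataframe_with_types` docstring example (the toy frame is illustrative):

import pandas as pd
from cinspect.dependence import numpy2d_to_dataframe_with_types

df = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]})  # int64 and float64 columns
values = df.values  # 2d ndarray; mixed dtypes collapse to float64
restored = numpy2d_to_dataframe_with_types(values, df.columns, df.dtypes)
assert list(restored.dtypes) == list(df.dtypes)  # original column types recovered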
From 73fc8f668e447efec005b964c8a391487a4ed3db Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 10 May 2023 13:02:25 +1000
Subject: [PATCH 07/20] Lint `evaluators.py`
---
cinspect/evaluators.py | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index 79fc75d..dd45ed0 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -172,7 +172,7 @@ def get_results(
-------
Any
The stored Evaluation (subclasses may override this)
-
+
Raises
------
Exception
@@ -739,11 +739,11 @@ def __init__(
self.scorer = scorer
self.conditional_filter = conditional_filter
- def prepare(self,
+ def prepare(self,
estimator: Estimator,
- X : npt.ArrayLike,
- y : Optional[npt.ArrayLike]=None ,
- random_state : RandomStateType=None):
+ X : npt.ArrayLike,
+ y : Optional[npt.ArrayLike] = None,
+ random_state : RandomStateType = None):
"""
 Prepare the evaluator with model and data information. Mutates the Evaluator's state.
@@ -800,9 +800,9 @@ def prepare(self,
self.n_original_columns = X.shape[1]
self.random_state = random_state
- def evaluate(self,
- estimator : Estimator,
- X: npt.ArrayLike,
+ def evaluate(self,
+ estimator : Estimator,
+ X: npt.ArrayLike,
y: npt.ArrayLike) -> List[Bunch]:
"""Evaluate the fitted estimator with input data.
@@ -816,11 +816,11 @@ def evaluate(self,
Feature data
y: npt.ArrayLike
Target data
-
+
Returns
-------
List[Bunch]
- A singleton list containing a Bunch that holds the permutation
+ A singleton list containing a Bunch that holds the permutation
importance for each feature.
"""
if self.end_transform_indx is not None:
@@ -869,12 +869,12 @@ def evaluate(self,
def aggregate(self, evaluations: Sequence[List[Bunch]]) -> List[Bunch]:
"""Concatenate sequence of evaluations.
-
+
Parameters
----------
evaluations: Sequence[List[`~sklearn.utils.Bunch`]]
Sequence of evaluations to concatenate
-
+
Returns
-------
List[`~sklearn.utils.Bunch`]
From 4ae0faa429666a3a2c63111a1d695cdbf4d1aebb Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 10 May 2023 13:10:13 +1000
Subject: [PATCH 08/20] Fix dependence, evaluator documentation bugs
---
cinspect/dependence.py | 4 ++--
cinspect/evaluators.py | 5 +++--
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/cinspect/dependence.py b/cinspect/dependence.py
index 277064a..fe77796 100644
--- a/cinspect/dependence.py
+++ b/cinspect/dependence.py
@@ -305,7 +305,7 @@ def plot_partial_dependence_with_uncertainty(
alpha : float = 0.5,
label : str = None,
 ci_bounds : Tuple[float, float] = (0.025, 0.975)
-) -> Tuple[mpl.Figure, dict]:
+) -> Tuple[mpl.figure.Figure, dict]:
"""
Plot partial dependence plot with uncertainty estimates.
@@ -349,7 +349,7 @@ def plot_partial_dependence_with_uncertainty(
Returns
-------
- fig: Figure
+ fig: `~mpl.figure.Figure`
A figure of the partial dependence results
res: dict
A results dictionary, with keys depending on the mode:
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index dd45ed0..6d9ad9b 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -17,9 +17,10 @@
from scipy.stats.mstats import mquantiles
from sklearn.base import clone
from sklearn.metrics import get_scorer
+from sklearn.utils import Bunch
from cinspect import dependence, importance
-from cinspect.utils import Bunch, get_column
+from cinspect.utils import get_column
LOG = logging.getLogger(__name__)
@@ -819,7 +820,7 @@ def evaluate(self,
Returns
-------
- List[Bunch]
+ List[`~sklearn.utils.Bunch`]
A singleton list containing a Bunch that holds the permutation
importance for each feature.
"""
From 9c6930402f8943735de01273900c33ea451cf8c6 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Thu, 11 May 2023 13:34:55 +1000
Subject: [PATCH 09/20] Add parallelism to README
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6624f05..4568c4c 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,8 @@ pieval = PermutationImportanceEvaluator(n_repeats=5)
# Bootstrap sample the data, re-fitting and re-evaluating the model each time.
# This will run the GridSearchCV estimator, so thereby performing model
# selection within each bootstrap sample.
-bootstrap_model(best_model, X, Y, [pdeval, pieval], replications=30)
+# n_jobs=-1 parallelises the bootstrapping to use all cores.
+bootstrap_model(best_model, X, Y, [pdeval, pieval], replications=30, n_jobs=-1)
# Plot results
pdeval.get_results(mode="interval") # PD plot with confidence intervals
From 1000156171853acafb7c29416d30497b4f8aa14d Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Thu, 11 May 2023 15:34:11 +1000
Subject: [PATCH 10/20] Add simulations module to sphinx docs
---
docs/source/{api/api.rst => cinspect/cinspect.rst} | 2 +-
docs/source/{api => cinspect}/dependence.rst | 0
docs/source/{api => cinspect}/dimension.rst | 0
docs/source/{api => cinspect}/estimators.rst | 0
docs/source/{api => cinspect}/evaluators.rst | 0
docs/source/{api => cinspect}/importance.rst | 0
docs/source/{api => cinspect}/model_evaluation.rst | 0
docs/source/{api => cinspect}/stats.rst | 0
docs/source/{api => cinspect}/utils.rst | 0
docs/source/index.rst | 3 ++-
docs/source/simulations/collinear_sim.rst | 7 +++++++
docs/source/simulations/datagen.rst | 7 +++++++
docs/source/simulations/simple_sim.rst | 7 +++++++
docs/source/simulations/simulations.rst | 11 +++++++++++
14 files changed, 35 insertions(+), 2 deletions(-)
rename docs/source/{api/api.rst => cinspect/cinspect.rst} (90%)
rename docs/source/{api => cinspect}/dependence.rst (100%)
rename docs/source/{api => cinspect}/dimension.rst (100%)
rename docs/source/{api => cinspect}/estimators.rst (100%)
rename docs/source/{api => cinspect}/evaluators.rst (100%)
rename docs/source/{api => cinspect}/importance.rst (100%)
rename docs/source/{api => cinspect}/model_evaluation.rst (100%)
rename docs/source/{api => cinspect}/stats.rst (100%)
rename docs/source/{api => cinspect}/utils.rst (100%)
create mode 100644 docs/source/simulations/collinear_sim.rst
create mode 100644 docs/source/simulations/datagen.rst
create mode 100644 docs/source/simulations/simple_sim.rst
create mode 100644 docs/source/simulations/simulations.rst
diff --git a/docs/source/api/api.rst b/docs/source/cinspect/cinspect.rst
similarity index 90%
rename from docs/source/api/api.rst
rename to docs/source/cinspect/cinspect.rst
index 3eb5dd4..7372e94 100644
--- a/docs/source/api/api.rst
+++ b/docs/source/cinspect/cinspect.rst
@@ -1,4 +1,4 @@
-API
-===
+cinspect API Reference
+======================
This is the application programming interface guide for Causal Inspection.
diff --git a/docs/source/api/dependence.rst b/docs/source/cinspect/dependence.rst
similarity index 100%
rename from docs/source/api/dependence.rst
rename to docs/source/cinspect/dependence.rst
diff --git a/docs/source/api/dimension.rst b/docs/source/cinspect/dimension.rst
similarity index 100%
rename from docs/source/api/dimension.rst
rename to docs/source/cinspect/dimension.rst
diff --git a/docs/source/api/estimators.rst b/docs/source/cinspect/estimators.rst
similarity index 100%
rename from docs/source/api/estimators.rst
rename to docs/source/cinspect/estimators.rst
diff --git a/docs/source/api/evaluators.rst b/docs/source/cinspect/evaluators.rst
similarity index 100%
rename from docs/source/api/evaluators.rst
rename to docs/source/cinspect/evaluators.rst
diff --git a/docs/source/api/importance.rst b/docs/source/cinspect/importance.rst
similarity index 100%
rename from docs/source/api/importance.rst
rename to docs/source/cinspect/importance.rst
diff --git a/docs/source/api/model_evaluation.rst b/docs/source/cinspect/model_evaluation.rst
similarity index 100%
rename from docs/source/api/model_evaluation.rst
rename to docs/source/cinspect/model_evaluation.rst
diff --git a/docs/source/api/stats.rst b/docs/source/cinspect/stats.rst
similarity index 100%
rename from docs/source/api/stats.rst
rename to docs/source/cinspect/stats.rst
diff --git a/docs/source/api/utils.rst b/docs/source/cinspect/utils.rst
similarity index 100%
rename from docs/source/api/utils.rst
rename to docs/source/cinspect/utils.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 81263ef..d4385be 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -10,7 +10,8 @@ Welcome to Causal Inspection's documentation!
:maxdepth: 2
:caption: Contents:
- api/api
+ cinspect/cinspect
+ simulations/simulations
diff --git a/docs/source/simulations/collinear_sim.rst b/docs/source/simulations/collinear_sim.rst
new file mode 100644
index 0000000..3c4327a
--- /dev/null
+++ b/docs/source/simulations/collinear_sim.rst
@@ -0,0 +1,7 @@
+.. _collinear_sim:
+
+simulations.collinear_sim
+=========================
+
+.. automodule:: simulations.collinear_sim
+ :members:
diff --git a/docs/source/simulations/datagen.rst b/docs/source/simulations/datagen.rst
new file mode 100644
index 0000000..2d165b5
--- /dev/null
+++ b/docs/source/simulations/datagen.rst
@@ -0,0 +1,7 @@
+.. _datagen:
+
+simulations.datagen
+===================
+
+.. automodule:: simulations.datagen
+ :members:
diff --git a/docs/source/simulations/simple_sim.rst b/docs/source/simulations/simple_sim.rst
new file mode 100644
index 0000000..b20375e
--- /dev/null
+++ b/docs/source/simulations/simple_sim.rst
@@ -0,0 +1,7 @@
+.. _simple_sim:
+
+simulations.simple_sim
+======================
+
+.. automodule:: simulations.simple_sim
+ :members:
diff --git a/docs/source/simulations/simulations.rst b/docs/source/simulations/simulations.rst
new file mode 100644
index 0000000..bfa609b
--- /dev/null
+++ b/docs/source/simulations/simulations.rst
@@ -0,0 +1,11 @@
+simulations
+===========
+
+This is the documentation for the `simulations` module.
+
+.. toctree::
+ :maxdepth: 2
+
+ collinear_sim
+ datagen
+ simple_sim
From db5cc143ba3ba638fcb5df851aba758ef0effe1a Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Thu, 11 May 2023 15:36:57 +1000
Subject: [PATCH 11/20] Minor updates to cinspect docs
---
cinspect/dependence.py | 13 +++++-----
cinspect/dimension.py | 4 ++--
cinspect/evaluators.py | 46 +++++++++++++++++++++++-------------
cinspect/model_evaluation.py | 3 +++
4 files changed, 40 insertions(+), 26 deletions(-)
diff --git a/cinspect/dependence.py b/cinspect/dependence.py
index fe77796..59e73d4 100644
--- a/cinspect/dependence.py
+++ b/cinspect/dependence.py
@@ -157,8 +157,8 @@ def construct_grid(
-------
grid, grid_counts: Tuple[np.ndarray, Optional[np.ndarray]]
Constructed grid, and its counts of unique elements.
- Returned grid_counts is not None iff
- grid_values=="unique" or ("auto" and n_unique(v)>auto_threshold).
+ Returned grid_counts is not None if and only if
+ grid_values=="unique" or ("auto" and n_unique(v)>auto_threshold).
Raises
------
@@ -327,11 +327,10 @@ def plot_partial_dependence_with_uncertainty(
The dependent variable's name (used only for labels), by default None
mode : str, optional
One of:
- multiple-pd-lines - a PD line for each sample of data
- derivative - a derivative PD plot with mean and confidence
- intervals.
- interval - a PD plot with confidence intervals
- ice-mu-sd - a PD plot with ICE mean and standard deviation
+ * multiple-pd-lines - a PD line for each sample of data
+ * derivative - a derivative PD plot with mean and confidence intervals.
+ * interval - a PD plot with confidence intervals
+ * ice-mu-sd - a PD plot with ICE mean and standard deviation
By default "multiple-pd-lines"
ax : plt.Axes, optional
Axes to plot on, by default None.
diff --git a/cinspect/dimension.py b/cinspect/dimension.py
index 5b7adf4..b7ff21f 100644
--- a/cinspect/dimension.py
+++ b/cinspect/dimension.py
@@ -16,8 +16,8 @@ def effective_rank(X: Union[np.ndarray, pd.DataFrame]) -> float:
In 2007 15th European Signal Processing Conference, 2007.
Parameters:
- X: 2d np.array
- The feature matrix
+ X: np.array
+ The 2d feature matrix
Returns
-------
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index 6d9ad9b..783d3c2 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -37,7 +37,7 @@ class Evaluator:
"""Abstract class for Evaluators to inherit from.
Each subclass should have an associated Evaluation type.
- This should be a monoid, where :method:`Evaluator.aggregate` is the monoid operation.
+ This should be a monoid, where :meth:`Evaluator.aggregate` is the monoid operation.
Internal state should be this Evaluation; should be initialised with the Monoidal identity
@@ -72,7 +72,8 @@ def prepare(self,
Features used for preparation (sub-class dependent semantics).
 Shape (n_samples, n_features)
y : Optional[npt.ArrayLike], optional
- Optional targets used for preparation, of shape (n_samples, n_targets), by default None.
+ Optional targets used for preparation, of shape `(n_samples, n_targets)`,
+ by default None.
random_state : RandomStateType, optional
Random state, by default None
"""
@@ -94,9 +95,9 @@ def evaluate(self,
estimator : Estimator
An sklearn estimator
X : npt.ArrayLike
- Features, of shape (n_samples, n_features)
+ Features, of shape `(n_samples, n_features)`
y : Optional[npt.ArrayLike], optional
- Optional targets, of shape (n_samples, n_targets), by default None
+ Optional targets, of shape `(n_samples, n_targets)`, by default None
Returns
-------
@@ -113,13 +114,24 @@ def aggregate(self, evaluations: Sequence[Evaluation]) -> Evaluation:
and is crucial for parallelisation.
Evaluation should be a monoid with respect to this operation for sane behaviour:
- - identity:
- - aggregate([]) == unit
- - aggregate( [unit] + evals ) == aggregate(evals) == aggregate(evals + [unit])
- - associative:
- - aggregate(aggregate([a]), aggregate([b,c])
- == aggregate([a,b,c])
- == aggregate(aggregate([a,b ]), aggregate([c])
+
+ * identity:
+ * aggregate([]) == unit
+ * and::
+
+ aggregate( [unit] + evals )
+
+ == aggregate(evals)
+
+ == aggregate(evals + [unit])
+ * associativity: ::
+
+ aggregate([aggregate([a]), aggregate([b, c])])
+
+ == aggregate([a, b, c])
+
+ == aggregate([aggregate([a, b]), aggregate([c])])
+
TODO examples
e.g. Evaluation could be a list of statistics, could be (mean, count) of a statistic,
@@ -240,9 +252,9 @@ def evaluate(self,
estimator : Estimator
An sklearn estimator
X : npt.ArrayLike
- Features, of shape (n_samples, n_features)
+ Features, of shape `(n_samples, n_features)`
y : Optional[npt.ArrayLike], optional
- Optional targets, of shape (n_samples, n_targets), by default None
+ Optional targets, of shape `(n_samples, n_targets)`, by default None
Returns
-------
@@ -361,9 +373,9 @@ def evaluate(self,
estimator : Estimator
An sklearn estimator
X : npt.ArrayLike
- Features, of shape (n_samples, n_features)
+ Features, of shape `(n_samples, n_features)`
y : npt.ArrayLike, optional
- Unused targets, of shape (n_samples, n_targets), by default None
+ Unused targets, of shape `(n_samples, n_targets)`, by default None
Returns
-------
@@ -529,9 +541,9 @@ def evaluate(self,
estimator : Estimator
An sklearn estimator
X : npt.ArrayLike
- Features, of shape (n_samples, n_features)
+ Features, of shape `(n_samples, n_features)`
y : Optional[npt.ArrayLike]
- targets, of shape (n_samples, n_targets), by default None
+ targets, of shape `(n_samples, n_targets)`, by default None
Returns
-------
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index c88dde3..4957abc 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -114,6 +114,7 @@ def bootstrap_model(
The same samples are passed into `fit` and `evaluate`.
Stratification is supported as in `sklearn.utils.resample`.
+ Mutates the evaluators in place, as well as returning them.
+
Parameters
----------
@@ -200,6 +201,8 @@ def bootcross_model(
 The input evaluators determine what statistics are computed with the
crossed bootstrap samples.
+ Mutates the evaluators in place, as well as returning them.
+
Parameters
----------
estimator : BaseEstimator
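The monoid laws spelled out in this patch can be made concrete with a tiny Evaluation type; this sketch (not from the library) uses a running (mean, count) pair, one of the examples the docstring's TODO mentions:

from typing import Sequence, Tuple

Evaluation = Tuple[float, int]  # (mean, count)
UNIT: Evaluation = (0.0, 0)     # the monoidal identity

def aggregate(evals: Sequence[Evaluation]) -> Evaluation:
    count = sum(c for _, c in evals)
    if count == 0:
        return UNIT
    return (sum(m * c for m, c in evals) / count, count)

a, b, c = (1.0, 2), (3.0, 1), (5.0, 1)
assert aggregate([]) == UNIT                                   # identity
assert aggregate([UNIT, a, b, c]) == aggregate([a, b, c])      # unit is neutral
assert aggregate([aggregate([a]), aggregate([b, c])]) == aggregate([a, b, c])  # associativity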
From 0afb584255e47e322cd40a568712499e03121778 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Thu, 11 May 2023 15:37:58 +1000
Subject: [PATCH 12/20] Update datagen docs
---
simulations/collinear_sim.py | 6 +-
simulations/datagen.py | 182 ++++++++++++++++++++++++++++-------
2 files changed, 152 insertions(+), 36 deletions(-)
diff --git a/simulations/collinear_sim.py b/simulations/collinear_sim.py
index 5b0e833..9dec5ea 100644
--- a/simulations/collinear_sim.py
+++ b/simulations/collinear_sim.py
@@ -29,12 +29,12 @@
TRUE_ATE = 0.3
-def make_data() -> Tuple[ArrayLike, ArrayLike]:
+def make_data() -> Tuple[pd.DataFrame, np.ndarray]:
"""Construct collinear simulation data.
Returns
-------
- (X, y) : Tuple[ArrayLike, ArrayLike]
+ (X, y) : Tuple[pd.DataFrame, np.ndarray]
(features, target)
"""
n = 500
@@ -56,7 +56,7 @@ def load_synthetic_data():
Returns
-------
- (X, y) : Tuple[ArrayLike, ArrayLike]
+ (X, y) : Tuple[pd.DataFrame, np.ndarray]
(features, target)
"""
data_file = "../data/synthetic_data.csv"
diff --git a/simulations/datagen.py b/simulations/datagen.py
index f9e69e3..0a8d858 100644
--- a/simulations/datagen.py
+++ b/simulations/datagen.py
@@ -16,7 +16,17 @@ def generate_sythetic_approximation(X: np.ndarray) -> np.ndarray:
This is a quick hack, to generate data with at least some properties of the original.
The covariance matrix is used to capture all the relationships between variables,
regardless of whether they are continuous or categorical.
- """
+
+ Parameters
+ ----------
+ X : np.ndarray
+ Input data
+
+ Returns
+ -------
+ np.ndarray
+ Approximated data, with same shape as X
+ """
c = X.T @ X
Xs = np.random.multivariate_normal(X.mean(axis=0), c, size=len(X))
for col in range(X.shape[1]):
@@ -27,34 +37,34 @@ def generate_sythetic_approximation(X: np.ndarray) -> np.ndarray:
class DGPGraph:
- """A high level Interface for building Bayesian network data generating processes.
+ """A high level interface for building Bayesian network data generating processes.
Example
-------
- ```
- alpha = 0.3
- n_x = 30
- support_size = 5
- coefs_T = np.zeros(n_x)
- coefs_T[0:support_size] = np.random.normal(1, 1, size=support_size)
+ .. code-block:: python
- coefs_Y = np.zeros(n_x)
- coefs_Y[0:support_size] = np.random.uniform(0, 1, size=support_size)
+ alpha = 0.3
+ n_x = 30
+ support_size = 5
+ coefs_T = np.zeros(n_x)
+ coefs_T[0:support_size] = np.random.normal(1, 1, size=support_size)
- def fX(n):
- return np.random.normal(0, 1, size=(n, n_x))
+ coefs_Y = np.zeros(n_x)
+ coefs_Y[0:support_size] = np.random.uniform(0, 1, size=support_size)
- def fT(X, n):
- return X @ coefs_T + np.random.uniform(-1, 1, size=n)
+ def fX(n):
+ return np.random.normal(0, 1, size=(n, n_x))
- def fY(X, T, n):
- return alpha * T + X @ coefs_Y + np.random.uniform(-1, 1, size=n)
+ def fT(X, n):
+ return X @ coefs_T + np.random.uniform(-1, 1, size=n)
- dgp = DGPGraph()
- dgp.add_node("X", fX)
- dgp.add_node("T", fT, parents=["X"])
- dgp.add_node("Y", fY, parents=["X", "T"])
- ```
+ def fY(X, T, n):
+ return alpha * T + X @ coefs_Y + np.random.uniform(-1, 1, size=n)
+
+ dgp = DGPGraph()
+ dgp.add_node("X", fX)
+ dgp.add_node("T", fT, parents=["X"])
+ dgp.add_node("Y", fY, parents=["X", "T"])
"""
# TODO add some tracking to keep track of shape of variables & warn if problems arise
@@ -77,10 +87,10 @@ def add_node(self, name, sample_func, parents=None, standardise=False):
sample_func: function(*tensors) -> np.ndarray
The sampling function for pyro.sample
- parents: (optional) [str]
+ parents: [str], optional
A list of the parents of this node. If None, node must be a root node.
- standardise: (optional) bool
+ standardise: bool, optional
Should the value of this node be automatically scaled & centered. Default False.
"""
@@ -94,11 +104,28 @@ def add_node(self, name, sample_func, parents=None, standardise=False):
self.shapes = self._check_func_returns()
def get_function(self, node):
- """Return the function for generating data for a node given its parents."""
+ """Return the function for generating data for a node given its parents.
+
+ Returns
+ -------
+ function
+ The function for generating data for a node given its parents.
+ """
return self.nodes[node][0]
def get_parents(self, node):
- """Return the list of parents for the given node or an empty list if there are none."""
+ """Return the list of parents for the given node or an empty list if there are none.
+
+ Parameters
+ ----------
+ node: str
+ The name of the node
+
+ Returns
+ -------
+ list
+ The list of parents for the given node or an empty list if there are none.
+ """
if node in self.parents:
return self.parents[node]
return []
@@ -147,7 +174,10 @@ def _expand(self, node, value, n):
return v * value
def draw_graph(self):
- """Draw the DAG for the data generating process."""
+ """Draw the DAG for the data generating process.
+
+ Uses networkx.
+ """
nx.draw(self.graph, with_labels=True)
def sample(self, n, interventions=None):
@@ -184,7 +214,26 @@ def sample(self, n, interventions=None):
return values
def ate(self, n, treatment_node, outcome_node, treatment_val=1, control_val=0):
- """Compute the estimated Average Treatment Effect based on a sample of size n."""
+ """Compute the estimated Average Treatment Effect based on a sample of size n.
+
+ Parameters
+ ----------
+ n: int
+ The number of samples to draw for the estimate
+ treatment_node: str
+ The name of the treatment node
+ outcome_node: str
+ The name of the outcome node
+ treatment_val: float, optional
+ The value of the treatment to use for the intervention. Default 1.
+ control_val: float, optional
+ The value of the control to use for the intervention. Default 0.
+
+ Returns
+ -------
+ ate: float
+ The estimated Average Treatment Effect
+ """
s1 = self.sample(n, interventions={treatment_node: treatment_val})
s0 = self.sample(n, interventions={treatment_node: control_val})
ate = s1[outcome_node].mean() - s0[outcome_node].mean()
@@ -200,8 +249,37 @@ def cate(
condition_values,
treatment_val=1,
control_val=0,
- ):
- """Compute the estimated Conditional Average Treatment Effect from a sample size n."""
+ ) -> np.ndarray:
+ """Compute the estimated Conditional Average Treatment Effect from a sample size n.
+
+ Multiple condition values can be passed and the CATE will be computed for each.
+
+ Parameters
+ ----------
+ n: int
+ treatment_node: str
+ The name of the treatment node
+ outcome_node: str
+ The name of the outcome node
+ condition_node: str
+ The name of the node to condition on
+ condition_values: list
+ The values of the condition node to condition on (one at a time)
+ treatment_val: float, optional
+ The value of the treatment to use for the intervention. Default 1.
+ control_val: float, optional
+ The value of the control to use for the intervention. Default 0.
+
+ Returns
+ -------
+ cate: np.ndarray
+ The estimated Conditional Average Treatment Effect for each condition value.
+ Shape (len(condition_values),)
+
+ Raises
+ ------
+ NotImplementedError
+ If the condition node is not a root node or has dimensionality > 1.
+ """
condition_shape = self.shapes[condition_node]
if len(condition_shape) > 1:
raise NotImplementedError(
@@ -235,8 +313,9 @@ def simple_triangle(
n_x=30,
support_size=5,
random_state=None
-):
- """Make a simple triangle model.
+) -> DGPGraph:
+ """
+ Make a simple triangle model.
This is just a simple "triangle" model with linear relationships.
X: confounding factors
@@ -245,7 +324,26 @@ def simple_triangle(
 Causal relationships are X->T, X->Y, T->Y.
- """
+ Confounders are iid standard normal, unlike in :func:`collinear_confounders`.
+
+ Parameters
+ ----------
+ alpha : float
+ Coefficient for the true causal effect of T on Y
+ binary_treatment : bool, optional
+ Whether the treatment is binary, by default False
+ n_x : int, optional
+ The number of confounding factors, by default 30
+ support_size : int, optional
+ The number of confounding factors that have non-zero X->T and X->Y coefficients, by default 5
+ random_state : random state | seed, optional
+ Random state, by default None
+
+ Returns
+ -------
+ DGPGraph
+ The generated graph
+ """
rng = check_random_state(random_state)
coefs_T = np.zeros(n_x)
coefs_T[0:support_size] = rng.normal(1, 1, size=support_size)
@@ -279,7 +377,8 @@ def collinear_confounders(
latent_dim=5,
random_state=None
):
- """Make a triangle model with many collinear confounding variables.
+ """
+ Make a triangle model with many collinear confounding variables.
This is just a simple "triangle" model with linear relationships.
X: confounding factors
@@ -288,6 +387,23 @@ def collinear_confounders(
 Causal relationships are X->T, X->Y, T->Y.
+ Parameters
+ ----------
+ true_ate : float
+ The true causal effect of T on Y
+ binary_treatment : bool, optional
+ Whether the treatment is binary, by default False
+ confounder_dim : int, optional
+ The number of confounding factors, by default 200
+ latent_dim : int, optional
+ The number of latent dimensions for the confounders, by default 5
+ random_state : random state | seed, optional
+ Random state, by default None
+
+ Returns
+ -------
+ DGPGraph
+ The generated graph
"""
rng = check_random_state(random_state)
# Confounder latent distribution
From 53207c533347880f1d523c676c4cd19ba599e831 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Thu, 11 May 2023 15:38:29 +1000
Subject: [PATCH 13/20] Linting
---
cinspect/dimension.py | 5 ++---
simulations/collinear_sim.py | 9 ++++-----
simulations/datagen.py | 2 +-
simulations/simple_sim.py | 12 +++++-------
tests/test_estimators.py | 1 +
tests/test_model_evaluation.py | 12 ++++++------
6 files changed, 19 insertions(+), 22 deletions(-)
diff --git a/cinspect/dimension.py b/cinspect/dimension.py
index b7ff21f..0366e97 100644
--- a/cinspect/dimension.py
+++ b/cinspect/dimension.py
@@ -2,9 +2,10 @@
# Licensed under the Apache 2.0 License.
"""Methods for reducing or understanding the dimensionality of a matrix."""
+from typing import Callable, List, Optional, Tuple, Union
+
import numpy as np
import pandas as pd
-from typing import Callable, Tuple, List, Optional, Union
def effective_rank(X: Union[np.ndarray, pd.DataFrame]) -> float:
@@ -23,8 +24,6 @@ def effective_rank(X: Union[np.ndarray, pd.DataFrame]) -> float:
-------
erank: float
The effective rank (will always be between 1 and rank(X))
-
-
"""
u, s, v = np.linalg.svd(X.T @ X)
norm_s = np.abs(s).sum()
diff --git a/simulations/collinear_sim.py b/simulations/collinear_sim.py
index 9dec5ea..0227378 100644
--- a/simulations/collinear_sim.py
+++ b/simulations/collinear_sim.py
@@ -7,17 +7,16 @@
import numpy as np
import pandas as pd
-from cinspect.dimension import effective_rank
-from cinspect.estimators import BinaryTreatmentRegressor
-from cinspect.evaluators import BinaryTreatmentEffect
-from cinspect.model_evaluation import bootstrap_model, crossval_model
from numpy.typing import ArrayLike
from sklearn.base import clone
-
# from sklearn.base import clone # required if we add *best* ridge regressor back in
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV, RepeatedKFold, ShuffleSplit
+from cinspect.dimension import effective_rank
+from cinspect.estimators import BinaryTreatmentRegressor
+from cinspect.evaluators import BinaryTreatmentEffect
+from cinspect.model_evaluation import bootstrap_model, crossval_model
from simulations.datagen import collinear_confounders
# Logging
diff --git a/simulations/datagen.py b/simulations/datagen.py
index 0a8d858..e129eb7 100644
--- a/simulations/datagen.py
+++ b/simulations/datagen.py
@@ -2,8 +2,8 @@
# Licensed under the Apache 2.0 License.
"""Data generation classed for causal simulations."""
-import numpy as np
import networkx as nx
+import numpy as np
from scipy.special import expit
from sklearn.kernel_approximation import RBFSampler
from sklearn.utils import check_random_state
diff --git a/simulations/simple_sim.py b/simulations/simple_sim.py
index 60f79b6..91ebf5d 100644
--- a/simulations/simple_sim.py
+++ b/simulations/simple_sim.py
@@ -7,16 +7,14 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
-from cinspect.evaluators import (
- PartialDependanceEvaluator,
- PermutationImportanceEvaluator,
-)
-from cinspect.model_evaluation import bootcross_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
-from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, GroupKFold
+from sklearn.pipeline import make_pipeline
+from cinspect.evaluators import (PartialDependanceEvaluator,
+ PermutationImportanceEvaluator)
+from cinspect.model_evaluation import bootcross_model
from simulations.datagen import simple_triangle
# Logging
@@ -83,7 +81,7 @@ def main():
pdeval = PartialDependanceEvaluator(feature_grids={"T": "auto"})
pieval = PermutationImportanceEvaluator(n_repeats=5)
bootcross_model(
- model, X, Y, [pdeval, pieval], replications=10, use_group_cv=True
+ model, X, Y, [pdeval, pieval], replications=10, use_group_cv=True, n_jobs=-1
) # To make sure we use GroupKFold
pdeval.get_results(mode="interval")
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index 424eadb..ca5e0bc 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -6,6 +6,7 @@
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, GroupKFold
+
from cinspect.estimators import BinaryTreatmentRegressor
from simulations.datagen import simple_triangle
diff --git a/tests/test_model_evaluation.py b/tests/test_model_evaluation.py
index 3895ff6..a3ad928 100644
--- a/tests/test_model_evaluation.py
+++ b/tests/test_model_evaluation.py
@@ -11,9 +11,8 @@
import hypothesis.strategies as hst
import numpy as np
import pytest
-from cinspect.evaluators import Evaluator, ScoreEvaluator
-from cinspect.model_evaluation import (_bootcross_split, bootcross_model,
- bootstrap_model, crossval_model)
+import test_utils
+import testing_strategies
from hypothesis import given
from numpy.random.mtrand import RandomState
from sklearn.base import BaseEstimator
@@ -22,8 +21,9 @@
from sklearn.model_selection._split import LeaveOneOut, TimeSeriesSplit
from sklearn.utils.validation import check_random_state
-import test_utils
-import testing_strategies
+from cinspect.evaluators import Evaluator, ScoreEvaluator
+from cinspect.model_evaluation import (_bootcross_split, bootcross_model,
+ bootstrap_model, crossval_model)
logger = logging.getLogger()
@@ -506,7 +506,7 @@ def _test_invariance_to_n_jobs(fn, n_jobs=-1, *args, **kwargs):
@hyp.settings(deadline=None)
@given(data=_default_crossval_data_strategy(n_jobs=hst.sampled_from([2])))
def test_crossval_parallelism(data):
- """Tests that n_jobs doesn't affect."""
+ """Tests that n_jobs doesn't affect crossval_model."""
try:
_test_invariance_to_n_jobs(
crossval_model,
From b3290dd0cb88f79e3b67602108f95d5b1cb60614 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Fri, 12 May 2023 14:51:11 +1000
Subject: [PATCH 14/20] Add docs generation shortcut to Makefile
---
Makefile | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/Makefile b/Makefile
index 6015848..fa3ad12 100644
--- a/Makefile
+++ b/Makefile
@@ -30,3 +30,7 @@ test:
test-ci:
pytest . --cov=cinspect tests/ --hypothesis-profile "ci"
+
+# shortcut for making html docs
+doc:
+ $(MAKE) html -C docs
From d521cac9b0fd634ced1086dd04a30f9d0e27ca4d Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Fri, 12 May 2023 14:51:52 +1000
Subject: [PATCH 15/20] Proofread and reformat docstrings for Sphinx
---
cinspect/dependence.py | 10 +++++---
cinspect/dimension.py | 5 ++--
cinspect/estimators.py | 9 +++++--
cinspect/evaluators.py | 46 ++++++++++++++++++------------------
cinspect/model_evaluation.py | 11 +++++----
cinspect/stats.py | 4 ++--
cinspect/utils.py | 3 ++-
docs/source/conf.py | 4 ++++
simulations/datagen.py | 4 +++-
9 files changed, 58 insertions(+), 38 deletions(-)
diff --git a/cinspect/dependence.py b/cinspect/dependence.py
index 59e73d4..0f4c05c 100644
--- a/cinspect/dependence.py
+++ b/cinspect/dependence.py
@@ -2,6 +2,10 @@
# Licensed under the Apache 2.0 License.
"""Partial dependence and individual conditional expectation functions."""
+# defers evaluation of annotations so sphinx can parse type aliases rather than
+# their expanded forms
+from __future__ import annotations
+
import numbers
from typing import List, Optional, Sequence, Tuple, Type, Union
@@ -83,7 +87,7 @@ def individual_conditional_expectation(
compute the curve. if an int is passed uses a linear grid of length
grid_values from the minimum to the maximum.
- predict_method: callable method on model (optional)
+ predict_method: callable method on model, optional
The method to call to predict.
Defaults to predict_proba for classifiers and predict for regressors.
@@ -250,7 +254,7 @@ def plot_partial_dependence_density(
Returns
-------
- bins: : np.ndarray
+ bins: np.ndarray
The edges of the bins. Length nbins + 1 (nbins left edges and right edge of last bin).
Always a single array even when multiple data sets are passed in.
@@ -348,7 +352,7 @@ def plot_partial_dependence_with_uncertainty(
Returns
-------
- fig: `~mpl.figure.Figure`
+ fig: :class:`mpl.figure.Figure`
A figure of the partial dependence results
res: dict
A results dictionary, with keys depending on the mode:
diff --git a/cinspect/dimension.py b/cinspect/dimension.py
index 0366e97..2298249 100644
--- a/cinspect/dimension.py
+++ b/cinspect/dimension.py
@@ -16,7 +16,8 @@ def effective_rank(X: Union[np.ndarray, pd.DataFrame]) -> float:
"The effective rank: A measure of effective dimensionality."
In 2007 15th European Signal Processing Conference, 2007.
- Parameters:
+ Parameters
+ ----------
X: np.array
The 2d feature matrix
@@ -50,7 +51,7 @@ def greedy_feature_selection(
maximise_metric: Callable
A function that takes X and returns a number.
- initial_col: Optional(int)
+ initial_col: int, optional
If set, the selected features will be initialised with this column.
num_to_select: int
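
The parameter names above pin down a plausible call; a hypothetical usage sketch, with the positional signature inferred from the docstring (argument order is an assumption, not shown in this hunk):

    import numpy as np
    from cinspect.dimension import effective_rank, greedy_feature_selection

    X = np.random.randn(100, 20)  # any 2-d feature matrix
    # Greedily pick 5 columns, scoring candidate subsets by effective rank;
    # initial_col=0 seeds the selection with the first column.
    selected = greedy_feature_selection(
        X, maximise_metric=effective_rank, initial_col=0, num_to_select=5
    )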
diff --git a/cinspect/estimators.py b/cinspect/estimators.py
index dfb9ff9..91357d3 100644
--- a/cinspect/estimators.py
+++ b/cinspect/estimators.py
@@ -2,6 +2,11 @@
# Licensed under the Apache 2.0 License.
"""Convenience estimators for causal estimation."""
+# defers evaluation of annotations so sphinx can parse type aliases rather than
+# their expanded forms
+
+from __future__ import annotations
+
from typing import Any, NamedTuple, Optional, Union # , Self
import numpy as np
@@ -168,7 +173,7 @@ class BinaryTreatmentRegressor(BaseEstimator, RegressorMixin):
treatment_column: Union[str, int]
Treatment column index
TODO: str only if it's a dataframe
- treatment_val: Optional[Any], default 1
+ treatment_val: Any, optional
Constant value of treatment column
which denotes that the current row is in the treatment cohort
TODO example
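
The `TODO example` above survives this patch; a hypothetical instantiation consistent with the documented parameters (the constructor signature is inferred here, not confirmed by the hunk):

    from sklearn.linear_model import Ridge
    from cinspect.estimators import BinaryTreatmentRegressor

    # Hypothetical: wrap a base regressor; rows where X["T"] == 1 form the
    # treatment cohort, per the documented treatment_val semantics.
    btr = BinaryTreatmentRegressor(
        estimator=Ridge(),
        treatment_column="T",  # str only if X is a DataFrame (see TODO above)
        treatment_val=1,
    )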
@@ -273,7 +278,7 @@ def set_params(self, **params: dict) -> Any: # TODO use Self: PEP 673, Python 3
"""
Set the parameters of this estimator.
- This is a method of :class:`~sklearn.base.BaseEstimator`.
+ This is a method of :class:`sklearn.base.BaseEstimator`.
TODO satisfy the following:
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index 783d3c2..8da57ce 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -1,6 +1,9 @@
# Copyright (c) Gradient Institute. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Result evaluator classes."""
+# defers evaluation of annotations so sphinx can parse type aliases rather than
+# their expanded forms
+from __future__ import annotations
import functools
import logging
@@ -27,9 +30,9 @@
# TODO sphinx documentation of custom types/type aliases
Estimator = TypeVar("Estimator") # intention is an sklearn estimator
-# random states as per sklearn
# https://scikit-learn.org/dev/glossary.html#term-random_state
# TODO sphinx documentation
+"""Type for random state, as per sklearn."""
RandomStateType = Optional[Union[int, np.random.RandomState]]
@@ -45,6 +48,8 @@ class Evaluator:
Mostly... Evaluator holds metadata for its Evaluation.
Liskov substitution principle suggests that subtypes should be swappable;
this is not currently true because we can't enforce the behaviour of the objects' consumers
+
+
"""
Evaluation = TypeVar("Evaluation")
@@ -59,11 +64,6 @@ def prepare(self,
This is called by a model evaluation function in model_evaluation.
- Parameters
- ----------
- Optional[Union[int, np.random.RandomState]], optional
- Random state, by default RandomStateType
-
Parameters
----------
estimator : Estimator
@@ -153,7 +153,7 @@ def aggregate(self, evaluations: Sequence[Evaluation]) -> Evaluation:
def set_evaluation(self, evaluation: Evaluation) -> None:
"""Setter; sets this object's evaluation.
- Subclasses should ensure that this and self.prepare(..)
+ Subclasses should ensure that this and :meth:`self.prepare`
are the only ways to modify internal state.
@@ -210,7 +210,7 @@ class ScoreEvaluator(Evaluator):
scorers: list[str|Scorer]
List of scorers/scorer names to compute.
Names -> scorer correspondence dictated by `sklearn.metrics.get_scorer`
- groupby: (optional) str or list[str]
+ groupby: str or list[str], optional
List or string indicating that scores should be calculated
separately within groups defined by this/these variables.
TODO: this is currently implemented implicitly using pandas
@@ -253,7 +253,7 @@ def evaluate(self,
An sklearn estimator
X : npt.ArrayLike
Features, of shape `(n_samples, n_features)`
- y : Optional[npt.ArrayLike], optional
+ y : npt.ArrayLike, optional
Optional targets, of shape `(n_samples, n_targets)`, by default None
Returns
@@ -288,7 +288,7 @@ def get_results(self,
Parameters
----------
- evaluation : Optional[ScoreEvaluation], by default None
+ evaluation : ScoreEvaluation, optional
ScoreEvaluation dictionary to convert. Otherwise extract this object's stored scores.
Returns
@@ -469,16 +469,16 @@ def __init__(
Parameters
----------
- feature_grids: (optional) Dict[str, npt.ArrayLike], by default None
+ feature_grids: Dict[str, npt.ArrayLike], optional
Map from feature_name to grid of values for that feature.
If set, dependence will only be computed for specified features.
conditional_filter:
- (optional) Callable[[npt.ArrayLike, npt.ArrayLike], Tuple[npt.ArrayLike, npt.ArrayLike]]
+ Callable[[npt.ArrayLike, npt.ArrayLike], Tuple[npt.ArrayLike, npt.ArrayLike]], optional
Used to filter X and y before computing dependence
Takes X,y, produces new X,y
by default None: no filter
- end_transform_indx: (optional) int
+ end_transform_indx: int, optional
compute dependence with respect to this point of the pipeline onwards.
TODO dive deep, write example
"""
@@ -542,7 +542,7 @@ def evaluate(self,
An sklearn estimator
X : npt.ArrayLike
Features, of shape `(n_samples, n_features)`
- y : Optional[npt.ArrayLike]
+ y : npt.ArrayLike, optional
targets, of shape `(n_samples, n_targets)`, by default None
Returns
@@ -614,7 +614,7 @@ def get_results(
_description_, by default "black"
color_samples: str, optional
_description_, by default "grey"
- pd_alpha: _type_, optional
+ pd_alpha: float, optional
_description_, by default None
ci_bounds: tuple, optional
_description_, by default (0.025, 0.975)
@@ -702,20 +702,20 @@ class PermutationImportanceEvaluator(Evaluator):
n_top: int
The number of features to show on the plot
- features: (optional) [int] or [str] or {str:[int]}
+ features: [int] or [str] or {str:[int]}, optional
A list of features, either indices or column names for which importance
should be computed. Defaults to computing importance for all features.
- end_transform_indx: (optional) int
+ end_transform_indx: int, optional
Set if you wish to compute feature importance with respect to features
after this point in the pipeline. Defaults to computing importance with
respect to the whole pipeline.
- grouped: bool (default=False)
+ grouped: bool, optional
Should features be permuted together as groups. If True, features must
be passed as a dictionary.
- conditional_filter: (optional) callable
+ conditional_filter: callable, optional
Used to filter X and y before computing importance
"""
@@ -768,7 +768,7 @@ def prepare(self,
The estimator that will be evaluated
X : npt.ArrayLike
Training feature data
- y : Optional[npt.ArrayLike], optional
+ y : npt.ArrayLike, optional
Training target data, by default None
random_state : RandomStateType, optional
Random state, by default None
@@ -832,7 +832,7 @@ def evaluate(self,
Returns
-------
- List[`~sklearn.utils.Bunch`]
+ List[:class:`~sklearn.utils.Bunch`]
A singleton list containing a Bunch that holds the permutation
importance for each feature.
"""
@@ -885,12 +885,12 @@ def aggregate(self, evaluations: Sequence[List[Bunch]]) -> List[Bunch]:
Parameters
----------
- evaluations: Sequence[List[`~sklearn.utils.Bunch`]]
+ evaluations: Sequence[List[:class:`~sklearn.utils.Bunch`]]
Sequence of evaluations to concatenate
Returns
-------
- List[`~sklearn.utils.Bunch`]
+ List[:class:`~sklearn.utils.Bunch`]
Concatenated evaluations
"""
return _flatten(evaluations)
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index 4957abc..177e8db 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -2,6 +2,9 @@
# Licensed under the Apache 2.0 License.
"""Causal model evaluation functions."""
+# defers evaluation of annotations so sphinx can parse type aliases rather than
+# their expanded forms
+from __future__ import annotations
import inspect
import logging
@@ -130,9 +133,9 @@ def bootstrap_model(
The number of bootstrap replications, by default 100
subsample : float, optional
Approximate proportion of the data to sample, by default 1.0
- stratify : Optional[Union[pd.Series, np.ndarray]], optional
+ stratify : Union[pd.Series, np.ndarray], optional
The stratification variable, by default None
- random_state : Optional[Union[int, np.random.RandomState]], optional
+ random_state : Union[int, np.random.RandomState], optional
The random state, by default None
use_group_cv : bool, optional
If true, the function inputs the indices of the re-sampled datasets into the estimator
@@ -219,7 +222,7 @@ def bootcross_model(
The approximate proportion (float in (0-1))
or count (int in [1,n])
of the data to be used for the test set, by default 0.25
- random_state : Optional[Union[int, np.random.RandomState]], optional
+ random_state : Union[int, np.random.RandomState], optional
The random state, by default None
use_group_cv : bool, optional
If true, the function inputs the indices of the re-sampled datasets into the estimator
@@ -376,7 +379,7 @@ def _repeatedly_evaluate_model(
Evaluators to use
use_group_cv : bool, optional
Whether to use group cross validation, by default False
- random_state : Optional[Union[int, np.random.RandomState]], optional
+ random_state : Union[int, np.random.RandomState], optional
Random state, by default None
n_jobs : int, optional
Number of jobs to use (via `joblib.Parallel`), by default 1
diff --git a/cinspect/stats.py b/cinspect/stats.py
index d1d7a1c..1868654 100644
--- a/cinspect/stats.py
+++ b/cinspect/stats.py
@@ -35,13 +35,13 @@ def conditional_cov(X, Y, estimator=None, bias=False, ddof=None):
A two-dimensional (n, p) array of conditioning variables.
Y: ndarray, DataFrame
A two-dimensional (n, d) array of variables.
- estimator: optional, scikit learn multiple output regression estimator
+ estimator: scikit learn multiple output regression estimator, optional
A multiple output regression estimator. By default this is a
LinearRegression estimator. This is to compute the relationship
E[Y|X] for the conditional covariance.
bias: bool
How to normalise the covariance matrix. See numpy.cov for more details.
- ddof: optional, int
+ ddof: int, optional
The degrees of freedom to use for normalisation. See numpy.cov for more
details.
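
The docstring describes the computation as the covariance of Y after removing E[Y|X]; a minimal sketch of that idea, assuming the default LinearRegression estimator (an illustration, not the module's implementation):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    def conditional_cov_sketch(X, Y, bias=False):
        """Conditional covariance: covariance of the residuals Y - E[Y|X]."""
        residuals = Y - LinearRegression().fit(X, Y).predict(X)  # remove E[Y|X]
        return np.cov(residuals, rowvar=False, bias=bias)        # columns are variables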
diff --git a/cinspect/utils.py b/cinspect/utils.py
index bc73b9a..e8ee2c0 100644
--- a/cinspect/utils.py
+++ b/cinspect/utils.py
@@ -1,10 +1,11 @@
# Copyright (c) Gradient Institute. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Common handy functions."""
+from functools import singledispatch
+
import numpy as np
import pandas as pd
from multimethod import multimethod
-from functools import singledispatch
@multimethod
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c4ad650..8ce05fd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -33,6 +33,10 @@
autoclass_content = 'class' # 'both': concatenate, display *both* the class and __init__ docstrings
autodoc_typehints = "both" # make explicit that typehints are shown in the signature, rather than the description
+# source files that import annotations from __future__ can use un-expanded type aliases listed below
+# See https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#confval-autodoc_type_aliases
+autodoc_type_aliases = {"npt.ArrayLike": "npt.ArrayLike", }
+
# -- Options for sphinx.ext.extlinks
# use :issue:`123` to link to project issue on GitHub
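
Note for reviewers: `autodoc_type_aliases` only takes effect in modules that use `from __future__ import annotations`, which is why those imports are added across the package in this patch. Further aliases could be registered the same way; a sketch (the second entry is illustrative, not in the patch):

    # docs/source/conf.py
    autodoc_type_aliases = {
        "npt.ArrayLike": "npt.ArrayLike",      # keep the alias, not the expanded form
        "RandomStateType": "RandomStateType",  # illustrative: alias defined in cinspect.evaluators
    }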
diff --git a/simulations/datagen.py b/simulations/datagen.py
index e129eb7..fe84993 100644
--- a/simulations/datagen.py
+++ b/simulations/datagen.py
@@ -253,9 +253,11 @@ def cate(
"""Compute the estimated Conditional Average Treatment Effect from a sample size n.
Multiple condition values can be passed and the CATE will be computed for each.
+
Parameters
----------
n: int
+ The number of samples to draw for the estimate
treatment_node: str
The name of the treatment node
outcome_node: str
@@ -271,7 +273,7 @@ def cate(
Returns
-------
- cate: np.ndarray
+ cate: :class:`np.ndarray`
The estimated Conditional Average Treatment Effect for each condition value.
Shape (len(condition_values),)
From 6ae76645618ee568cf0f996727225d893c1f0df2 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Fri, 12 May 2023 14:52:51 +1000
Subject: [PATCH 16/20] Lint `evaluators.py`
---
cinspect/evaluators.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/cinspect/evaluators.py b/cinspect/evaluators.py
index 8da57ce..d7cf439 100644
--- a/cinspect/evaluators.py
+++ b/cinspect/evaluators.py
@@ -48,8 +48,6 @@ class Evaluator:
Mostly... Evaluator holds metadata for its Evaluation.
Liskov substitution principle suggests that subtypes should be swappable;
this is not currently true because we can't enforce the behaviour of the objects' consumers
-
-
"""
Evaluation = TypeVar("Evaluation")
@@ -288,7 +286,7 @@ def get_results(self,
Parameters
----------
- evaluation : ScoreEvaluation, optional
+ evaluation : ScoreEvaluation, optional
ScoreEvaluation dictionary to convert. Otherwise extract this object's stored scores.
Returns
From 26292147b708d9d0dd914e6e591cebaf55604b0d Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 16 May 2023 14:21:12 +1000
Subject: [PATCH 17/20] Export simulations as package in `setup.py`
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 5b94fcd..c29c9f1 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@
keywords="causality inspection interpretability",
# You can just specify the packages manually here if your project is
# simple. Or you can use find_packages().
- packages=["cinspect"],
+ packages=["cinspect", "simulations"],
# List run-time dependencies here. These will be installed by pip when
# your project is installed. For an analysis of "install_requires" vs pip"s
# requirements files see:
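
As the retained comment says, the manual list could instead use package discovery; a sketch of that alternative (not what this patch does):

    from setuptools import find_packages

    # Discover both top-level packages and any subpackages automatically.
    packages = find_packages(include=["cinspect*", "simulations*"])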
From 51b78144beb815befe518d35c297062ec219758c Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 16 May 2023 17:19:52 +1000
Subject: [PATCH 18/20] Change README to reStructuredText
---
README.md => README.rst | 79 ++++++++++++++++++++++-------------------
1 file changed, 43 insertions(+), 36 deletions(-)
rename README.md => README.rst (67%)
diff --git a/README.md b/README.rst
similarity index 67%
rename from README.md
rename to README.rst
index 4568c4c..00c1407 100644
--- a/README.md
+++ b/README.rst
@@ -3,10 +3,12 @@ Causal Inspection
A Scikit-learn inspired inspection module for *causal models*.
-
+.. image:: https://github.com/gradientinstitute/causal-inspection/blob/main/pd_examples.png
+ :alt: Example partial dependence plots
+
Plots generated using this library; these are an example of how partial
dependence plots can be used for visualising causal effect, see [3] for
-more details.
+more details.
Using machine learning for (observational) causal inference is distinct from
how machine learning is used for prediction. Typically a process like the
@@ -31,8 +33,7 @@ plotting for continuous and discrete treatment effects [1, 2], as well as
methods for estimating binary and categorical treatment effects.
We have implemented (some) of the visualisation and quantification methods
-discussed in [1] and [2]. Please see the [Example
-Usage](https://github.com/gradientinstitute/causal-inspection#example-usage)
+discussed in [1] and [2]. Please see the `Example Usage`_
section for more details.
@@ -42,15 +43,19 @@ Installation
To just install the cinspect package, clone it from github and then in the
cloned directory,
+::
+
pip install .
To also install the extra packages required for development and simulation,
install in the following way,
+::
+
pip install -e .[dev]
-You may have to escape some of the characters in this command, e.g. `pip
-install -e .\[dev\]`. You can then run the simulations in the `simulations`
+You may have to escape some of the characters in this command, e.g. ``pip
+install -e .\[dev\]``. You can then run the simulations in the ``simulations``
directory.
@@ -65,9 +70,9 @@ Modules
Example Usage
-------------
-We strive for an interface that is familiar to those who use scikit-learn. In
-particular we have emulated the interface to the
-[`cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)
+We strive for an interface that is familiar to those who use `scikit-learn <https://scikit-learn.org/>`_.
+In particular we have emulated the interface to the
+`cross_validate <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html>`_
function.
The advantage of this interface is that you can use scikit-learn pipeline
@@ -79,40 +84,42 @@ partial dependence plots with confidence intervals, and permutation importance
plots.
-```python
-import matplotlib.pyplot as plt
+.. code:: python
+
+ import matplotlib.pyplot as plt
+
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.model_selection import GridSearchCV
+ from cinspect import (bootstrap_model, PartialDependanceEvaluator,
+ PermutationImportanceEvaluator)
+
+ # X is a pandas dataframe with a column labelled "T" for treatment
+ # ...
-from sklearn.ensemble import GradientBoostingRegressor
-from sklearn.model_selection import GridSearchCV
-from cinspect import (bootstrap_model, PartialDependanceEvaluator,
- PermutationImportanceEvaluator)
+ # Model, with built in model selection
+ model = GridSearchCV(
+ GradientBoostingRegressor(),
+ param_grid={"max_depth": [1, 2, 3]}
+ )
-# X is a pandas dataframe with a column labelled "T" for treatment
-# ...
+ # Causal estimation - partial dependence and permutation importance
+ pdeval = PartialDependanceEvaluator(feature_grids={"T": "auto"})
+ pieval = PermutationImportanceEvaluator(n_repeats=5)
-# Model, with built in model selection
-model = GridSearchCV(
- GradientBoostingRegressor(),
- param_grid={"max_depth": [1, 2, 3]}
-)
+ # Bootstrap sample the data, re-fitting and re-evaluating the model each time.
+ # This will run the GridSearchCV estimator, thereby performing model
+ # selection within each bootstrap sample.
+ # n_jobs=-1 parallelises the bootstrapping to use all cores.
+ bootstrap_model(model, X, Y, [pdeval, pieval], replications=30, n_jobs=-1)
-# Casual estimation - partial dependence and permutation importance
-pdeval = PartialDependanceEvaluator(feature_grids={"T": "auto"})
-pieval = PermutationImportanceEvaluator(n_repeats=5)
+ # Plot results
+ pdeval.get_results(mode="interval") # PD plot with confidence intervals
+ pdeval.get_results(mode="derivative") # Derivative PD plots, see [2]
+ pieval.get_results(ntop=5) # Permutation importance, show top 5 features
-# Bootstrap sample the data, re-fitting and re-evaluating the model each time.
-# This will run the GridSearchCV estimator, so thereby performing model
-# selection within each bootstrap sample.
-# n_jobs=-1 parallelises the bootstrapping to use all cores.
-bootstrap_model(best_model, X, Y, [pdeval, pieval], replications=30, n_jobs=-1)
+ plt.show()
-# Plot results
-pdeval.get_results(mode="interval") # PD plot with confidence intervals
-pdeval.get_results(mode="derivative") # Derivative PD plots, see [2]
-pieval.get_results(ntop=5) # Permutation importance, show top 5 features
-plt.show()
-```
See `simulations/simple_sim.py` for a slightly more complex version where we
integrate model selection within the bootstrap sampling procedure.
From f073f5016c8802b4268af237e67f7697379eef00 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Tue, 16 May 2023 17:22:29 +1000
Subject: [PATCH 19/20] Fix `setup.py` reference to `README`
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index c29c9f1..a5d2210 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
-with open(path.join(here, "README.md"), encoding="utf-8") as f:
+with open(path.join(here, "README.rst"), encoding="utf-8") as f:
long_description = f.read()
setup(
From f43be67ce48400970c6e24f2493ef18d63a42cb1 Mon Sep 17 00:00:00 2001
From: Jack Fitzgerald Sice <12495594+jack-fs@users.noreply.github.com>
Date: Wed, 17 May 2023 12:10:56 +1000
Subject: [PATCH 20/20] Update `crossval_model` docs
---
cinspect/model_evaluation.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/cinspect/model_evaluation.py b/cinspect/model_evaluation.py
index 177e8db..2c78956 100644
--- a/cinspect/model_evaluation.py
+++ b/cinspect/model_evaluation.py
@@ -59,7 +59,9 @@ def crossval_model(
evaluators : Sequence[Evaluator]
A list of evaluators.
cv : Union[int, BaseCrossValidator], optional
- The cross validation strategy, by default KFold(n_splits=5)
+ The cross validation strategy, by default KFold(n_splits=5).
+ Passing an integer will use a shuffled KFold with that number of splits,
+ similar (but not identical) to `sklearn.model_selection.cross_validate`.
random_state : Union[int, np.random.RandomState], optional
The random state, by default None
stratify : Union[np.ndarray, pd.Series], optional
@@ -75,6 +77,9 @@ def crossval_model(
# Run various checks and prepare the evaluators
random_state = check_random_state(random_state)
+ # TODO: use sklearn.model_selection.check_cv instead
+ # this will directly mimic sklearn.model_selection.cross_validate
+ # but changes behaviour (uses StratifiedKFold) if the estimator is a classifier
if isinstance(cv, int):
cv = KFold(n_splits=cv, shuffle=True, random_state=random_state)
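
To illustrate the documented equivalence, a usage sketch (ScoreEvaluator's scorer-list argument is assumed from its docstring; the data are synthetic):

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold
    from cinspect.evaluators import ScoreEvaluator
    from cinspect.model_evaluation import crossval_model

    rng = np.random.RandomState(42)
    X = pd.DataFrame(rng.randn(100, 3), columns=["T", "x1", "x2"])
    y = X @ np.array([2.0, 1.0, -1.0]) + rng.randn(100)

    score = ScoreEvaluator(["r2"])  # assumed: a list of scorer names

    # cv=5 builds KFold(n_splits=5, shuffle=True, random_state=...) internally...
    results = crossval_model(LinearRegression(), X, y, [score], cv=5, random_state=42)
    # ...while an explicit BaseCrossValidator is passed through unchanged.
    results = crossval_model(LinearRegression(), X, y, [score], cv=KFold(n_splits=5))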