diff --git a/.github/workflows/autoformat.yml b/.github/workflows/autoformat.yml index 41247db..1fe667e 100644 --- a/.github/workflows/autoformat.yml +++ b/.github/workflows/autoformat.yml @@ -2,7 +2,7 @@ name: Auto-format Code on: push: - branches: [main, develop] + branches: [main, devel*] workflow_dispatch: jobs: diff --git a/docs/vignettes/more_advanced_models.md b/docs/vignettes/more_advanced_models.md index ed9fd5a..d31bd06 100644 --- a/docs/vignettes/more_advanced_models.md +++ b/docs/vignettes/more_advanced_models.md @@ -81,6 +81,33 @@ my_output = my_analysis.collect() my_output.to_md() ``` +## Modelling Followup with a Natural Cubic Spline + +By default, followup time enters the outcome model as a linear and quadratic term (`followup` and `followup_sq`). If you prefer a more flexible non-linear representation, you can replace these with a natural cubic spline basis by setting `followup_spline=True`. Because the spline replaces the standard followup terms, you must also set `followup_include=False` to avoid conflicting specifications. + +```python +my_options = SEQopts( + followup_spline=True, + followup_include=False, + km_curves=True, +) +``` + +The spline basis is constructed using patsy's `cr()` function. Knot positions are computed once from the full expanded dataset before any bootstrap resampling, and are held fixed across all bootstrap iterations and the survival prediction grid. This ensures the spline basis is identical at fit time and prediction time regardless of how the bootstrap sample happens to be distributed. + +The degrees of freedom for the spline are controlled by `followup_spline_df`, which defaults to `4` (matching the default in the R package SEQTaRget). Increasing this allows a more flexible curve; decreasing it towards the minimum of `2` produces a stiffer fit. + +```python +my_options = SEQopts( + followup_spline=True, + followup_include=False, + followup_spline_df=6, # more flexible spline + km_curves=True, +) +``` + +The rest of the analytical pipeline is unchanged — `expand()`, `fit()`, `survival()`, and `collect()` all work exactly as before. + ## That's it? Yes! There are very few differences between the code for more straightforward and more difficult analyses using this package. Our hope is that through utilizing almost only the SEQopts to work with your analysis, that this is a streamlined process that is also easy to manipulate. diff --git a/pySEQTarget/SEQopts.py b/pySEQTarget/SEQopts.py index 5cd32ae..c4fc496 100644 --- a/pySEQTarget/SEQopts.py +++ b/pySEQTarget/SEQopts.py @@ -45,6 +45,8 @@ class SEQopts: :type followup_include: bool :param followup_spline: Boolean to force followup values to be fit to cubic spline :type followup_spline: bool + :param followup_spline_df: Degrees of freedom for the followup cubic spline, default ``4`` + :type followup_spline_df: int :param followup_max: Maximum allowed followup in analysis :type followup_max: int or None :param followup_min: Minimum allowed followup in analysis @@ -130,6 +132,7 @@ class SEQopts: followup_max: int = None followup_min: int = 0 followup_spline: bool = False + followup_spline_df: int = 4 hazard_estimate: bool = False indicator_baseline: str = "_bas" indicator_squared: str = "_sq" diff --git a/pySEQTarget/analysis/_outcome_fit.py b/pySEQTarget/analysis/_outcome_fit.py index 1bfc045..a0fdda6 100644 --- a/pySEQTarget/analysis/_outcome_fit.py +++ b/pySEQTarget/analysis/_outcome_fit.py @@ -1,12 +1,30 @@ import re +import numpy as np import polars as pl import statsmodels.api as sm import statsmodels.formula.api as smf -def _apply_spline_formula(formula, indicator_squared): - spline = "cr(followup, df=3)" +def _compute_spline_knots(followup_arr, df=3): + lower = float(np.min(followup_arr)) + upper = float(np.max(followup_arr)) + n_inner = df - 2 + if n_inner == 0: + inner_knots = [] + else: + # Replicate patsy's knot placement: percentiles of unique values in [lower, upper] + x = np.unique(followup_arr[(lower <= followup_arr) & (followup_arr <= upper)]) + q = np.linspace(0, 100, n_inner + 2)[1:-1] + inner_knots = np.percentile(x, q.tolist()).tolist() + return inner_knots, lower, upper + + +def _apply_spline_formula(formula, indicator_squared, spline_knots): + inner_knots, lower, upper = spline_knots + spline = ( + f"cr(followup, knots={inner_knots}, lower_bound={lower}, upper_bound={upper})" + ) formula = re.sub(r"(\w+)\s*\*\s*followup\b", rf"\1*{spline}", formula) formula = re.sub(r"\bfollowup\s*\*\s*(\w+)", rf"{spline}*\1", formula) @@ -18,8 +36,8 @@ def _apply_spline_formula(formula, indicator_squared): formula = re.sub(r"^\s*\+\s*|\s*\+\s*$", "", formula).strip() if formula: - return f"{formula} + I({spline}**2)" - return f"I({spline}**2)" + return f"{formula} + {spline}" + return spline def _cast_categories(self, df_pd): @@ -64,7 +82,13 @@ def _outcome_fit( df_pd = _cast_categories(self, df.to_pandas()) if self.followup_spline: - formula = _apply_spline_formula(formula, self.indicator_squared) + if getattr(self, "_current_boot_idx", None) is None: + self._followup_spline_knots = _compute_spline_knots( + self.DT["followup"].to_numpy(), df=self.followup_spline_df + ) + formula = _apply_spline_formula( + formula, self.indicator_squared, self._followup_spline_knots + ) full_formula = f"{outcome} ~ {formula}" diff --git a/pySEQTarget/error/_param_checker.py b/pySEQTarget/error/_param_checker.py index a926569..66492cb 100644 --- a/pySEQTarget/error/_param_checker.py +++ b/pySEQTarget/error/_param_checker.py @@ -44,6 +44,9 @@ def _param_checker(self): "Only one of followup_class or followup_include can be set to True." ) + if self.followup_spline_df < 2: + raise ValueError("followup_spline_df must be at least 2.") + if ( self.weighted and self.method == "ITT" diff --git a/pyproject.toml b/pyproject.toml index de87979..73035e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pySEQTarget" -version = "0.13.2" +version = "0.13.3" description = "Sequentially Nested Target Trial Emulation" readme = "README.md" license = {text = "MIT"} diff --git a/tests/test_followup_options.py b/tests/test_followup_options.py index 74d5451..c2005d5 100644 --- a/tests/test_followup_options.py +++ b/tests/test_followup_options.py @@ -57,17 +57,18 @@ def test_followup_spline(): s.fit() matrix = s.outcome_model[0]["outcome"].summary2().tables[1]["Coef."].to_list() expected = [ - -6.264817962084417, - 0.20125056343026881, - 0.12568743032952776, - 0.03823426390103046, - 0.0006607691746414019, - 0.003343365539743267, - -0.01319460158923785, - 0.19601796921732118, - -0.5186462478511427, - 0.37598656666756103, - 1.6553848469346044, + -4.804282252748607, + 0.19115933860001255, + 0.12717121164606823, + 0.044310717515918724, + 0.0005814999431447507, + 0.0032948355025455216, + -0.013371824500839971, + 0.19972467861548412, + -2.027245615586753, + -1.395729861856384, + -0.9397731941281695, + -0.4415335811772879, ] assert [round(x, 3) for x in matrix] == [round(x, 3) for x in expected]