callaway-santanna: make per-(g,t) analytical SE cluster-aware

igerber · claude · igerber · commit 2b166cb0e9d4 · 2026-05-23T06:58:39.000-04:00
CI codex R3 P0: the cluster wiring contract documented in REGISTRY.md
("cluster=X means CR1 Liang-Zeger on the IF") was honored at the
aggregate inference surface (overall_se, event-study, group, bootstrap)
but the per-cell public surface results.group_time_effects[(g,t)]["se"]
remained unit-level. Users inspecting per-cell ATT(g,t) inference under
cluster= got silently misleading SE/t/p/CI even though overall inference
was correctly cluster-robust.

Fix: new module-level _cluster_robust_se_from_per_gt_if helper that
aggregates the per-(g,t) IF by PSU and returns CR1 Liang-Zeger SE.
Applied at all 4 ATT(g,t) computation sites identified by the Codex review:

1. _compute_all_att_gt_vectorized (no-covariate vectorized batch) —
   recompute se after building inf_info, overwrite group_time_effects
   [(g,t)]["se"] which was set with the unit-level value
2. _compute_all_att_gt_covariate_reg (covariate-reg batch) — same pattern
3. Main panel single-cell loop (after _compute_att_gt_fast) — local
   se_gt update flows into gte_entry["se"]
4. RC fit loop (after _compute_att_gt_rc) — uses resolved_survey.psu
   (per-obs) instead of resolved_survey_unit.psu (per-unit)

The recompute is gated by `if psu is not None`, so cluster=None remains
bit-equal to pre-PR. For cluster=unit (each unit its own cluster), the
CR1 formula coincides with the unit-level IF formula (modulo ddof
conventions in the underlying OR path) — methodologically consistent
with Williams (2000) CR1-on-IF for IF-based estimators.

Tests:
- test_per_gt_analytical_se_changes_with_cluster: asserts at least one
  (g,t) cell shows measurable SE divergence between cluster=None and
  cluster="state" on a panel with intra-cluster correlation
- test_per_gt_se_matches_explicit_survey_design: asserts per-(g,t) SE
  agrees (rel=1e-10) between bare cluster="state" and explicit
  SurveyDesign(psu="state") — both activate the same CR1 aggregation

All 414 tests (test_staggered + test_staggered_rc + test_triple_diff +
test_honest_did + test_two_stage) pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py
@@ -92,6 +92,65 @@ def _linear_regression(
     return beta, residuals
 
 
+def _cluster_robust_se_from_per_gt_if(
+    inf_info: Optional[Dict[str, Any]],
+    psu_array: np.ndarray,
+) -> Optional[float]:
+    """Cluster-robust (PSU-aggregated IF) SE for a single (g,t) ATT.
+
+    Computes, for one per-(g,t) cell:
+
+        psi_per_index[i] = sum of IFs at this index for this (g, t)
+        psi_per_cluster[c] = sum_{i in c} psi_per_index[i]
+        se = sqrt(sum(psi_per_cluster ** 2))
+
+    NOTE(review): call sites describe this as CR1 Liang-Zeger, but no
+    G/(G-1) factor or centering is applied here; confirm that is intended.
+    Panel callers pass ``resolved_survey_unit.psu`` (per-unit); the RCS
+    caller passes ``resolved_survey.psu`` (per-obs). Either way the index
+    arrays in ``inf_info`` must be valid offsets into ``psu_array``; the
+    lengths of ``treated_inf`` / ``control_inf`` are not checked here.
+
+    Returns ``None`` when ``inf_info`` is ``None`` or lacks an IF field,
+    when any index falls outside ``[0, len(psu_array))``, or when
+    ``psu_array`` is empty; otherwise the SE as a ``float``.
+    """
+    if (
+        inf_info is None
+        or "treated_inf" not in inf_info
+        or "control_inf" not in inf_info
+        or "treated_idx" not in inf_info
+        or "control_idx" not in inf_info
+    ):
+        return None
+    treated_idx = np.asarray(inf_info["treated_idx"])
+    control_idx = np.asarray(inf_info["control_idx"])
+    treated_inf = np.asarray(inf_info["treated_inf"])
+    control_inf = np.asarray(inf_info["control_inf"])
+    n = len(psu_array)
+    if (
+        treated_idx.size > 0
+        and (treated_idx.max(initial=-1) >= n or treated_idx.min(initial=0) < 0)
+    ) or (
+        control_idx.size > 0
+        and (control_idx.max(initial=-1) >= n or control_idx.min(initial=0) < 0)
+    ):
+        return None
+    psi_per_index = np.zeros(n)
+    if treated_idx.size:
+        np.add.at(psi_per_index, treated_idx, treated_inf)
+    if control_idx.size:
+        np.add.at(psi_per_index, control_idx, control_inf)
+    # Map PSU labels to dense integer codes so np.add.at can sum per cluster
+    _, psu_codes = np.unique(psu_array, return_inverse=True)
+    n_clusters = int(psu_codes.max() + 1) if psu_codes.size else 0
+    if n_clusters == 0:
+        return None
+    psi_per_cluster = np.zeros(n_clusters)
+    np.add.at(psi_per_cluster, psu_codes, psi_per_index)
+    return float(np.sqrt(np.sum(psi_per_cluster**2)))
+
+
 def _safe_inv(
     A: np.ndarray,
     tracker: Optional[list] = None,
@@ -1035,14 +1094,30 @@ def _compute_all_att_gt_vectorized(
             all_units = precomputed["all_units"]
             treated_positions = np.where(treated_valid)[0]
             control_positions = np.where(control_valid)[0]
-            influence_func_info[(g, t)] = {
+            inf_info_gt = {
                 "treated_idx": treated_positions,
                 "control_idx": control_positions,
                 "treated_units": all_units[treated_positions],
                 "control_units": all_units[control_positions],
                 "treated_inf": inf_treated,
                 "control_inf": inf_control,
             }
+            influence_func_info[(g, t)] = inf_info_gt
+
+            # Cluster-aware per-(g,t) SE: aggregate the per-(g,t) IF by
+            # PSU when a survey design (explicit OR synthesized from bare
+            # cluster=) provides one. Bit-equal to pre-PR when psu is None.
+            rsu_for_gt = precomputed.get("resolved_survey_unit")
+            if rsu_for_gt is not None and getattr(rsu_for_gt, "psu", None) is not None:
+                se_cluster = _cluster_robust_se_from_per_gt_if(inf_info_gt, rsu_for_gt.psu)
+                if se_cluster is not None and np.isfinite(se_cluster):
+                    se = se_cluster
+                    # gte_entry["se"] was set with the unit-level value
+                    # at the gte_entry construction above; overwrite with
+                    # the cluster-aware value so the public surface
+                    # group_time_effects[(g,t)]["se"] reflects the
+                    # documented CR1 contract.
+                    group_time_effects[(g, t)]["se"] = se
 
             atts.append(att)
             ses.append(se)
@@ -1379,14 +1454,24 @@ def _compute_all_att_gt_covariate_reg(
                 all_units = precomputed["all_units"]
                 treated_positions = np.where(treated_valid)[0]
                 control_positions = np.where(control_valid)[0]
-                influence_func_info[(g, t)] = {
+                inf_info_gt = {
                     "treated_idx": treated_positions,
                     "control_idx": control_positions,
                     "treated_units": all_units[treated_positions],
                     "control_units": all_units[control_positions],
                     "treated_inf": inf_treated,
                     "control_inf": inf_control,
                 }
+                influence_func_info[(g, t)] = inf_info_gt
+
+                # Cluster-aware per-(g,t) SE — see same pattern in
+                # _compute_all_att_gt_vectorized.
+                rsu_for_gt = precomputed.get("resolved_survey_unit")
+                if rsu_for_gt is not None and getattr(rsu_for_gt, "psu", None) is not None:
+                    se_cluster = _cluster_robust_se_from_per_gt_if(inf_info_gt, rsu_for_gt.psu)
+                    if se_cluster is not None and np.isfinite(se_cluster):
+                        se = se_cluster
+                        group_time_effects[(g, t)]["se"] = se
 
                 atts.append(att)
                 ses.append(se)
@@ -1820,6 +1905,22 @@ def fit(
                     agg_w = rc_result[6] if len(rc_result) > 6 else n_treat
 
                     if att_gt is not None:
+                        # Cluster-aware per-(g,t) SE on the RCS path. RC
+                        # IF indices are per-obs (vs per-unit on the panel
+                        # path); the corresponding PSU array is
+                        # ``resolved_survey.psu`` (length n_obs), not
+                        # ``resolved_survey_unit.psu``. Bit-equal to pre-PR
+                        # when psu is None.
+                        rs_for_gt = precomputed.get("resolved_survey") if precomputed else None
+                        if (
+                            rs_for_gt is not None
+                            and getattr(rs_for_gt, "psu", None) is not None
+                            and inf_info is not None
+                        ):
+                            se_cluster = _cluster_robust_se_from_per_gt_if(inf_info, rs_for_gt.psu)
+                            if se_cluster is not None and np.isfinite(se_cluster):
+                                se_gt = se_cluster
+
                         t_stat, p_val, ci = safe_inference(
                             att_gt,
                             se_gt,
@@ -1912,6 +2013,22 @@ def fit(
                     )
 
                     if att_gt is not None:
+                        # Cluster-aware per-(g,t) SE: when a survey PSU is
+                        # in play (explicit OR synthesized from bare
+                        # cluster=), aggregate the per-(g,t) IF by PSU
+                        # and use CR1 Liang-Zeger SE instead of the
+                        # unit-level diff-of-means SE returned by OR/IPW/DR.
+                        # Preserves bit-equality when psu is None.
+                        rsu_for_gt = precomputed.get("resolved_survey_unit")
+                        if (
+                            rsu_for_gt is not None
+                            and getattr(rsu_for_gt, "psu", None) is not None
+                            and inf_info is not None
+                        ):
+                            se_cluster = _cluster_robust_se_from_per_gt_if(inf_info, rsu_for_gt.psu)
+                            if se_cluster is not None and np.isfinite(se_cluster):
+                                se_gt = se_cluster
+
                         t_stat, p_val, ci = safe_inference(
                             att_gt,
                             se_gt,
diff --git a/tests/test_staggered.py b/tests/test_staggered.py
@@ -4829,6 +4829,98 @@ def test_bare_cluster_bootstrap_se_differs_from_unit_level(self):
             "not be reaching the bootstrap multiplier-weights routing."
         )
 
+    def test_per_gt_analytical_se_changes_with_cluster(self):
+        """Per-(g,t) analytical SE must respond to ``cluster=``.
+
+        Fits the _generate_clustered_staggered_data panel with and without
+        cluster="state" and requires at least one shared (g, t) cell whose
+        "se" differs by more than 1e-6. Per CI codex R3 P0 finding."""
+        data = _generate_clustered_staggered_data(seed=97)
+
+        cs_unit = CallawaySantAnna()
+        res_unit = cs_unit.fit(
+            data,
+            outcome="outcome",
+            unit="unit",
+            time="time",
+            first_treat="first_treat",
+        )
+        cs_cluster = CallawaySantAnna(cluster="state")
+        res_cluster = cs_cluster.fit(
+            data,
+            outcome="outcome",
+            unit="unit",
+            time="time",
+            first_treat="first_treat",
+        )
+
+        # Compare every (g, t) cell that exists in both fits
+        gt_keys = sorted(
+            set(res_unit.group_time_effects.keys()) & set(res_cluster.group_time_effects.keys())
+        )
+        assert len(gt_keys) > 0, "expected overlapping (g, t) keys"
+
+        # At least one (g, t) cell must show measurable SE divergence;
+        # cells with a non-finite SE in either fit are skipped, and an
+        # empty comparison set fails the assert via max_diff = 0.0.
+        diffs = []
+        for gt in gt_keys:
+            se_unit = res_unit.group_time_effects[gt]["se"]
+            se_cluster = res_cluster.group_time_effects[gt]["se"]
+            if np.isfinite(se_unit) and np.isfinite(se_cluster):
+                diffs.append(abs(se_unit - se_cluster))
+        max_diff = max(diffs) if diffs else 0.0
+        assert max_diff > 1e-6, (
+            f"Per-(g,t) SEs did not change with cluster= (max diff "
+            f"across {len(diffs)} cells: {max_diff:.6g}). The cluster= "
+            "parameter may not be reaching the per-(g,t) analytical SE "
+            "computation."
+        )
+
+    def test_per_gt_se_matches_explicit_survey_design(self):
+        """Bare cluster="state" and SurveyDesign(psu="state") must give
+        matching per-(g,t) SEs (rel=1e-10, abs=1e-12) on every shared
+        cell; cells with a non-finite SE in either fit are skipped.
+        Per CI codex R3 P0 finding."""
+        from diff_diff import SurveyDesign
+
+        data = _generate_clustered_staggered_data(seed=101)
+
+        cs_bare = CallawaySantAnna(cluster="state")
+        res_bare = cs_bare.fit(
+            data,
+            outcome="outcome",
+            unit="unit",
+            time="time",
+            first_treat="first_treat",
+        )
+
+        cs_explicit = CallawaySantAnna()
+        res_explicit = cs_explicit.fit(
+            data,
+            outcome="outcome",
+            unit="unit",
+            time="time",
+            first_treat="first_treat",
+            survey_design=SurveyDesign(psu="state"),
+        )
+
+        gt_keys = sorted(
+            set(res_bare.group_time_effects.keys()) & set(res_explicit.group_time_effects.keys())
+        )
+        assert len(gt_keys) > 0
+
+        for gt in gt_keys:
+            se_bare = res_bare.group_time_effects[gt]["se"]
+            se_explicit = res_explicit.group_time_effects[gt]["se"]
+            if np.isfinite(se_bare) and np.isfinite(se_explicit):
+                assert se_bare == pytest.approx(se_explicit, rel=1e-10, abs=1e-12), (
+                    f"Per-(g,t) SE divergence at {gt}: bare cluster=state "
+                    f"({se_bare}) vs explicit SurveyDesign(psu=state) "
+                    f"({se_explicit}). Both should activate the same CR1 "
+                    "aggregation."
+                )
+
     def test_survey_design_psu_wins_under_bootstrap(self):
         """Bootstrap path: when survey_design=SurveyDesign(psu=Y) is
         explicit AND cluster=X is also set with a different partition,