From f41d0b67cc8c66e9243e35357a3dfcb5a49a4a47 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:27:01 -0400 Subject: [PATCH 01/11] Add replicate-weight variance and PSU-level bootstrap to dCDH Extends the group-level Taylor Series Linearization survey support landed in PR #307 with two opt-in variance mechanisms: 1. Replicate-weight variance (BRR / Fay / JK1 / JKn / SDR) routed through the unified compute_replicate_if_variance helper at every IF site (overall DID_M, joiners DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l, heterogeneity beta^het_l, twowayfeweights helper). Uses inline-branch pattern mirroring EfficientDiD (efficient_did.py:1119-1142) and TripleDifference (triple_diff.py:1206-1238). Effective df_survey reduces to min(n_valid) - 1 across IF sites when replicates fail, matching the precedent in efficient_did.py:1133-1135 and triple_diff.py:676-686. 2. PSU-level Hall-Mammen wild bootstrap via group_to_psu_map threaded through _compute_dcdh_bootstrap. Under auto-inject psu=group the identity-map fast path preserves pre-change behavior bit-for-bit. Under an explicitly coarser PSU (e.g., psu=state with county-level groups), all groups in the same PSU receive the same bootstrap multiplier so within-PSU correlation is preserved. Locked contracts: - Replicate weights and n_bootstrap > 0 are mutually exclusive (replicate variance is closed-form; bootstrap would double-count). Raises NotImplementedError, matching efficient_did.py:989 etc. - Strata participate only through analytical TSL variance, not the bootstrap (multiplier weights stay unit-by-unit). - Warning fires only when n_psu_eff < n_groups_eff (strictly coarser PSU). Under auto-inject psu=group the warning is correctly suppressed. 
Adds 36 regression tests in tests/test_survey_dcdh_replicate_psu.py covering Class A sites (via _survey_se_from_group_if), Class B sites (heterogeneity + twowayfeweights), PSU bootstrap semantics, and cross-cutting invariants (replicate+bootstrap rejection, HonestDiD under replicate). REGISTRY.md lines 651-653 updated to document both mechanisms. Full suite: 525 passing (was 489 before this change). --- diff_diff/chaisemartin_dhaultfoeuille.py | 310 +++++++-- .../chaisemartin_dhaultfoeuille_bootstrap.py | 118 +++- diff_diff/guides/llms-full.txt | 6 +- docs/methodology/REGISTRY.md | 6 +- tests/test_survey_dcdh.py | 66 +- tests/test_survey_dcdh_replicate_psu.py | 592 ++++++++++++++++++ 6 files changed, 1012 insertions(+), 86 deletions(-) create mode 100644 tests/test_survey_dcdh_replicate_psu.py diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 9e8b9818..9af4b6c0 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -714,13 +714,15 @@ def fit( f"weight_type='pweight', got '{resolved_survey.weight_type}'. " f"The survey IF variance math assumes probability weights." ) - if (resolved_survey.replicate_weights is not None - and resolved_survey.replicate_weights.shape[1] > 0): - raise NotImplementedError( - "Replicate weight variance for dCDH is not yet supported. " - "Use strata/PSU/FPC for design-based inference via Taylor " - "Series Linearization." - ) + # Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are supported + # for analytical variance via compute_replicate_if_variance() + # at each IF site (see _survey_se_from_group_if). The combination + # of replicate weights and n_bootstrap > 0 is rejected inside + # the bootstrap entry block (replicate variance is closed-form; + # bootstrap would double-count variance). Matches library + # precedent: efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. 
+ # Within-group-constant PSU/strata is required: the IF # expansion psi_i = U[g] * (w_i / W_g) supports within-group # variation in WEIGHTS (each obs contributes proportionally), @@ -767,6 +769,16 @@ def fit( "resolved": resolved_survey, } + # Replicate-weight variance tracker: each _compute_se call under + # a replicate design returns n_valid (number of replicate columns + # that produced a finite estimate). The effective df_survey is + # min(n_valid) - 1 across all IF sites — matches the precedent in + # `diff_diff/efficient_did.py:1133-1135` and + # `diff_diff/triple_diff.py:676-686`. Under TSL (analytical), + # _compute_se returns None for n_valid and df_survey falls through + # to resolved_survey.df_survey (= n_psu - n_strata). + _replicate_n_valid_list: List[int] = [] + # ------------------------------------------------------------------ # Step 4b: Covariate aggregation (DID^X, Web Appendix Section 1.2) # ------------------------------------------------------------------ @@ -1589,16 +1601,18 @@ def fit( U_centered_l = _cohort_recenter(U_l_elig, cid_elig) N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] - se_l = _compute_se( + se_l, n_valid_l = _compute_se( U_centered=U_centered_l, divisor=N_l_h, obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_l, ) + if n_valid_l is not None: + _replicate_n_valid_list.append(n_valid_l) multi_horizon_se[l_h] = se_l did_l_val = multi_horizon_dids[l_h]["did_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) multi_horizon_inference[l_h] = { "effect": did_l_val, @@ -1716,15 +1730,17 @@ def fit( cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if 
eligible_mask_pl[g]] - se_pl_l = _compute_se( + se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_pl, ) + if n_valid_pl_l is not None: + _replicate_n_valid_list.append(n_valid_pl_l) placebo_horizon_se[lag_l] = se_pl_l pl_val = pl_data["placebo_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( pl_val, se_pl_l, alpha=self.alpha, df=_df_s ) @@ -1799,12 +1815,14 @@ def fit( ) # Analytical SE for DID_M (survey-aware when survey_design provided) - overall_se = _compute_se( + overall_se, n_valid_overall = _compute_se( U_centered=U_centered_overall, divisor=N_S, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_overall is not None: + _replicate_n_valid_list.append(n_valid_overall) # Detect the degenerate-cohort case: every variance-eligible group # forms its own (D_{g,1}, F_g, S_g) cohort, so the centered # influence function is identically zero and `_plugin_se` returns @@ -1829,21 +1847,22 @@ def fit( UserWarning, stacklevel=2, ) - _df_survey = ( - resolved_survey.df_survey if resolved_survey is not None else None - ) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( overall_att, overall_se, alpha=self.alpha, df=_df_survey ) # Joiners SE (uses joiner-only centered IF; conservative bound) if joiners_available: - joiners_se = _compute_se( + joiners_se, n_valid_joiners = _compute_se( U_centered=U_centered_joiners, divisor=joiner_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_joiners is not None: + _replicate_n_valid_list.append(n_valid_joiners) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( 
joiners_att, joiners_se, alpha=self.alpha, df=_df_survey ) @@ -1857,12 +1876,15 @@ def fit( # Leavers SE if leavers_available: - leavers_se = _compute_se( + leavers_se, n_valid_leavers = _compute_se( U_centered=U_centered_leavers, divisor=leaver_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_leavers is not None: + _replicate_n_valid_list.append(n_valid_leavers) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( leavers_att, leavers_se, alpha=self.alpha, df=_df_survey ) @@ -1905,14 +1927,64 @@ def fit( bootstrap_results: Optional[DCDHBootstrapResults] = None if self.n_bootstrap > 0: if resolved_survey is not None: - warnings.warn( - "Bootstrap with survey_design uses group-level multiplier " - "weights, not PSU-level. This is conservative when groups " - "are finer than PSUs. PSU-level survey bootstrap is " - "deferred to a future release.", - UserWarning, - stacklevel=2, - ) + # Replicate-weight designs have their own closed-form + # variance (Rao-Wu / BRR / Fay / JK1 / JKn / SDR via + # compute_replicate_if_variance). Combining replicate + # variance with a multiplier bootstrap would double-count + # variance. Match library precedent: + # efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + if ( + resolved_survey.replicate_weights is not None + and resolved_survey.replicate_weights.shape[1] > 0 + ): + raise NotImplementedError( + "dCDH survey support rejects the combination of " + "replicate weights and n_bootstrap > 0. Replicate-" + "weight variance is closed-form " + "(compute_replicate_if_variance); use n_bootstrap=0. " + "For a bootstrap-based SE, use strata/PSU/FPC design " + "instead of replicate weights." + ) + + # Warning fires only when PSU is strictly coarser than + # group. 
Under auto-inject psu=group (or explicit + # psu=<group>), groups and PSUs coincide so + # Hall-Mammen wild PSU bootstrap equals group-level + # multiplier bootstrap — no need to warn. + psu_arr_warn = getattr(resolved_survey, "psu", None) + if psu_arr_warn is None or _obs_survey_info is None: + # No PSU info — can't compare to group count. + n_psu_eff_warn, n_groups_eff_warn = -1, -1 + else: + pos_mask_warn = ( + np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + > 0 + ) + psu_eff = np.asarray(psu_arr_warn)[pos_mask_warn] + gids_eff_warn = np.asarray( + _obs_survey_info["group_ids"] + )[pos_mask_warn] + n_psu_eff_warn = int(len(np.unique(psu_eff))) + n_groups_eff_warn = int(len(np.unique(gids_eff_warn))) + if 0 <= n_psu_eff_warn < n_groups_eff_warn: + warnings.warn( + f"Bootstrap with survey_design uses Hall-Mammen " + f"wild multiplier weights at the PSU level " + f"(n_psu={n_psu_eff_warn} PSUs across " + f"n_groups={n_groups_eff_warn} groups). For " + f"designs with substantially unequal PSU sizes, " + f"the wild bootstrap may under-cover relative to " + f"analytical TSL inference; consider " + f"n_bootstrap=0 for the TSL variance. If n_psu " + f"is close to n_groups, lonely-PSU removal " + f"(lonely_psu='remove') may be collapsing " + f"singletons.", + UserWarning, + stacklevel=2, + ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att) if joiners_available else None ) @@ -2028,6 +2100,48 @@ def fit( h_data["did_l"], ) + # Under a survey design with PSU information, build a dense + # group_to_psu_map so the bootstrap randomizes at the PSU + # level rather than at the group level (Hall-Mammen wild PSU + # bootstrap). PSU/strata are within-group-constant by the + # _validate_group_constant_strata_psu precondition, so each + # variance-eligible group has exactly one PSU label. Under + # auto-inject psu=group the map is the identity and the + # bootstrap mixin's fast path reproduces the pre-PSU + # behavior bit-for-bit. 
+ group_to_psu_map_bootstrap: Optional[np.ndarray] = None + if ( + resolved_survey is not None + and getattr(resolved_survey, "psu", None) is not None + and _obs_survey_info is not None + ): + obs_psu_codes = np.asarray(resolved_survey.psu) + obs_gids_boot = np.asarray(_obs_survey_info["group_ids"]) + obs_weights_boot = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_boot = obs_weights_boot > 0 + group_psu_labels: List[Any] = [] + valid_map = True + for gid in _eligible_group_ids: + mask_g = (obs_gids_boot == gid) & pos_mask_boot + if not mask_g.any(): + valid_map = False + break + labels = obs_psu_codes[mask_g] + # Within-group-constant PSU is validated upstream; + # .item() on the first is robust to integer vs string. + group_psu_labels.append(labels[0]) + if valid_map and len(group_psu_labels) == n_groups_for_overall_var: + # Dense integer indices for _generate_psu_or_group_weights. + _, group_to_psu_map_bootstrap = np.unique( + np.asarray(group_psu_labels), + return_inverse=True, + ) + group_to_psu_map_bootstrap = np.asarray( + group_to_psu_map_bootstrap, dtype=np.int64 + ) + br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, u_centered_overall=U_centered_overall, @@ -2038,6 +2152,7 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, + group_to_psu_map=group_to_psu_map_bootstrap, ) bootstrap_results = br @@ -2496,6 +2611,7 @@ def fit( rank_deficient_action=self.rank_deficient_action, group_ids_order=np.array(all_groups), obs_survey_info=_obs_survey_info, + replicate_n_valid_list=_replicate_n_valid_list, ) twfe_weights_df = None @@ -3278,6 +3394,7 @@ def _compute_heterogeneity_test( rank_deficient_action: str = "warn", group_ids_order: Optional[np.ndarray] = None, obs_survey_info: Optional[Dict[str, Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Dict[int, Dict[str, Any]]: """Test for heterogeneous treatment 
effects (Web Appendix Section 1.5). @@ -3323,14 +3440,25 @@ def _compute_heterogeneity_test( obs_survey_info is not None and group_ids_order is not None ) if use_survey: - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - df_s = ( - resolved.df_survey if resolved is not None else None - ) + # df_s starts from the design-level df (n_psu - n_strata under TSL, + # R - 1 under replicate). It may be reduced further by replicate + # failures reported via replicate_n_valid_list (this function + # appends its own n_valid observations). + if ( + replicate_n_valid_list is not None + and len(replicate_n_valid_list) > 0 + ): + df_s = int(min(replicate_n_valid_list)) - 1 + else: + df_s = resolved.df_survey if resolved is not None else None # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, # Step 6 multi-switch) appear in obs_gids_raw but must be @@ -3484,15 +3612,33 @@ def _compute_heterogeneity_test( obs_w_raw[mask_g] / w_sum_g ) - # Binder TSL variance across stratified PSUs. - var_s = compute_survey_if_variance(psi_obs, resolved) + # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR) + # vs Binder TSL across stratified PSUs. Mirrors the inline + # branch in _survey_se_from_group_if and the pattern in + # TripleDifference:1206-1238. Heterogeneity uses WLS with + # full-sample weights; theta_hat is treated as fixed per the + # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity + # under replicate — no per-replicate refits). 
+ if getattr(resolved, "uses_replicate_variance", False): + var_s, n_valid_het = compute_replicate_if_variance( + psi_obs, resolved + ) + if replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_het) + # Reduce df_s to reflect this horizon's n_valid. + df_s_local = int(n_valid_het) - 1 if df_s is None else min( + df_s, int(n_valid_het) - 1 + ) + else: + var_s = compute_survey_if_variance(psi_obs, resolved) + df_s_local = df_s se_het = ( float(np.sqrt(var_s)) if np.isfinite(var_s) and var_s > 0 else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s + beta_het, se_het, alpha=alpha, df=df_s_local ) results[l_h] = { @@ -4812,24 +4958,51 @@ def _validate_group_constant_strata_psu( ) +def _effective_df_survey( + resolved_survey: Any, + replicate_n_valid_list: List[int], +) -> Optional[int]: + """Compute the effective ``df_survey`` for t-critical values. + + Under replicate-weight designs, each IF site returns an ``n_valid`` + count (number of replicate columns that produced a finite estimate). + This helper returns ``min(n_valid) - 1`` across sites, matching the + precedent in ``diff_diff/efficient_did.py:1133-1135`` and + ``diff_diff/triple_diff.py:676-686``. Under TSL (analytical) or + non-survey fits, ``replicate_n_valid_list`` is empty and this falls + through to ``resolved_survey.df_survey`` (or ``None`` when there's + no survey design at all). + """ + if resolved_survey is None: + return None + if replicate_n_valid_list: + return int(min(replicate_n_valid_list)) - 1 + return resolved_survey.df_survey + + def _compute_se( U_centered: np.ndarray, divisor: int, obs_survey_info: Optional[dict], eligible_groups: Optional[list] = None, -) -> float: +) -> Tuple[float, Optional[int]]: """Dispatch to plug-in SE or survey-design-aware SE. When ``obs_survey_info`` is ``None``, falls back to the simple plug-in formula. Otherwise, expands group-level IFs and delegates - to TSL variance. 
+ to TSL variance (or replicate-weight variance when the resolved + design carries replicate weights). + + Returns ``(se, n_valid_replicates)``. ``n_valid_replicates`` is + ``None`` under the plug-in / TSL paths, and the number of valid + replicate columns under the replicate path. """ if obs_survey_info is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if eligible_groups is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if divisor <= 0: - return float("nan") + return float("nan"), None # dCDH IFs are numerator-scale (U.sum() == N_S * DID_M). # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. @@ -4845,16 +5018,20 @@ def _survey_se_from_group_if( U_centered: np.ndarray, eligible_groups: list, obs_survey_info: dict, -) -> float: +) -> Tuple[float, Optional[int]]: """Compute survey-design-aware SE from group-level centered IFs. Expands group-level influence function values to observation level using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` - so that ``sum(psi_i for i in g) = U[g]``, then delegates to - ``compute_survey_if_variance()`` for the TSL design-based variance. + so that ``sum(psi_i for i in g) = U[g]``, then delegates to either + ``compute_survey_if_variance()`` (TSL / analytical path) or + ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) depending + on whether the resolved design carries replicate weights. This is a library extension not in the dCDH papers (the paper's - plug-in variance assumes iid sampling). + plug-in variance assumes iid sampling). The replicate path uses + Rao-Wu weight-ratio rescaling; see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey expansion. 
Parameters ---------- @@ -4869,10 +5046,18 @@ def _survey_se_from_group_if( Returns ------- - float - Survey-design-aware SE, or NaN if degenerate. + Tuple[float, Optional[int]] + ``(se, n_valid_replicates)``. ``se`` is the survey-design-aware + SE, or NaN if degenerate. ``n_valid_replicates`` is the number + of valid replicate columns used for variance under the replicate + path, or ``None`` under the TSL (analytical) path. Callers + track the min across sites to set an effective ``df_survey`` + for replicate designs. """ - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) # Degenerate-cohort contract (mirror _plugin_se): when the centered # IF is empty or every cohort is a singleton (→ recentered IF is @@ -4882,9 +5067,9 @@ def _survey_se_from_group_if( # through _compute_se (overall, joiners/leavers, multi-horizon # ATT, placebos, normalized/cumulated, heterogeneity). if U_centered.size == 0: - return float("nan") + return float("nan"), None if float((U_centered ** 2).sum()) <= 0: - return float("nan") + return float("nan"), None group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] @@ -4902,7 +5087,7 @@ def _survey_se_from_group_if( psi = np.zeros(n_obs, dtype=np.float64) if not pos_mask.any(): - return float("nan") + return float("nan"), None gids_eff = np.asarray(group_ids)[pos_mask] w_eff = weights_arr[pos_mask] @@ -4922,10 +5107,20 @@ def _survey_se_from_group_if( safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) psi[pos_mask] = u_obs_eff * (w_eff / safe_w) + # Dispatch: replicate variance (BRR/Fay/JK1/JKn/SDR) vs TSL. + # Mirrors the inline branch in TripleDifference:1206-1238 and + # EfficientDiD:1119-1142. Under replicate, returns (variance, n_valid); + # the caller propagates min(n_valid) to df_survey. 
+ if getattr(resolved, "uses_replicate_variance", False): + variance, n_valid = compute_replicate_if_variance(psi, resolved) + if not np.isfinite(variance) or variance < 0: + return float("nan"), n_valid + return float(np.sqrt(variance)), n_valid + variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: - return float("nan") - return float(np.sqrt(variance)) + return float("nan"), None + return float(np.sqrt(variance)), None def _build_group_time_design( @@ -5240,16 +5435,13 @@ def twowayfeweights( f"The TWFE diagnostic under survey uses survey-weighted cell " f"means; other weight types are not supported." ) - if ( - resolved is not None - and resolved.replicate_weights is not None - and resolved.replicate_weights.shape[1] > 0 - ): - raise NotImplementedError( - "Replicate weight variance for twowayfeweights() is not " - "supported. Use strata/PSU/FPC for design-based inference " - "via Taylor Series Linearization (matches the fit() path)." - ) + # Replicate-weight designs are accepted. TWFE diagnostic has no + # SE field on TWFEWeightsResult — the replicate weights only + # participate through the full-sample weight (resolved.weights), + # which drives survey-weighted cell aggregation in + # _validate_and_aggregate_to_cells. Diagnostic numbers + # (beta_fe, sigma_fe, fraction_negative) match + # fit(..., survey_design=sd).twfe_* under replicate input. 
# Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 7eea75cf..06eec38a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -70,6 +70,8 @@ def _compute_dcdh_bootstrap( # --- Phase 2: multi-horizon inputs --- multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, + # --- Survey: PSU-level bootstrap under survey designs --- + group_to_psu_map: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. @@ -175,6 +177,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, + group_to_psu_map=group_to_psu_map, ) else: overall_se = np.nan @@ -206,6 +209,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_j.size), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -225,6 +229,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_l.size), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -244,6 +249,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_pl.size), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -259,13 +265,18 @@ def _compute_dcdh_bootstrap( es_pvals: Dict[int, float] = {} es_dists: Dict[int, np.ndarray] = {} - # Shared weight matrix sized for the group set + # Shared weight matrix sized for the group set. 
Under PSU-level + # bootstrap (Hall-Mammen wild PSU), weights are drawn once per + # PSU and broadcast to groups so all groups in the same PSU + # share a multiplier within a single bootstrap replicate — + # preserving the sup-t joint distribution across horizons. n_groups_mh = n_groups_for_overall - shared_weights = _generate_bootstrap_weights_batch( + shared_weights = _generate_psu_or_group_weights( n_bootstrap=self.n_bootstrap, - n_units=n_groups_mh, + n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, + group_to_psu_map=group_to_psu_map, ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): @@ -324,6 +335,7 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_h.size), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -341,6 +353,93 @@ def _compute_dcdh_bootstrap( # ============================================================================= +def _slice_psu_map( + group_to_psu_map: Optional[np.ndarray], + target_size: int, +) -> Optional[np.ndarray]: + """Slice a full-set group-to-PSU map to a target's contributing group count. + + Multi-horizon / joiner / leaver targets share the overall variance- + eligible group ordering but may have fewer contributing groups + (``u_centered.shape[0] <= n_groups_for_overall``). Slice the first + ``target_size`` entries — this mirrors the existing shared-weights + truncation at the multi-horizon bootstrap site. Returns ``None`` + when no map is provided (plain multiplier-bootstrap path). 
+ """ + if group_to_psu_map is None: + return None + if target_size <= 0 or target_size > len(group_to_psu_map): + return group_to_psu_map + return group_to_psu_map[:target_size] + + +def _generate_psu_or_group_weights( + n_bootstrap: int, + n_groups_target: int, + weight_type: str, + rng: np.random.Generator, + group_to_psu_map: Optional[np.ndarray], +) -> np.ndarray: + """Generate a group-level weight matrix, possibly via PSU broadcasting. + + When ``group_to_psu_map`` is ``None`` or is the identity (each group + is its own PSU), generates weights at the group level directly — + bit-identical to the pre-PSU-bootstrap contract. + + When ``group_to_psu_map`` has fewer unique values than + ``n_groups_target`` (strictly coarser PSU than group), generates + weights at the PSU level and broadcasts to groups via the map. + This is the Hall-Mammen wild PSU bootstrap. + + Parameters + ---------- + n_bootstrap, weight_type, rng + Passed through to generate_bootstrap_weights_batch. + n_groups_target : int + Number of groups contributing to the target's IF vector. + group_to_psu_map : np.ndarray or None + Dense integer PSU indices of shape ``(n_groups_target,)``. + ``None`` triggers the group-level path. + + Returns + ------- + np.ndarray + Shape ``(n_bootstrap, n_groups_target)`` multiplier weights. + """ + if group_to_psu_map is None: + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + if len(group_to_psu_map) != n_groups_target: + raise ValueError( + f"group_to_psu_map length ({len(group_to_psu_map)}) does not " + f"match n_groups_target ({n_groups_target})." + ) + n_psu = int(np.max(group_to_psu_map)) + 1 if group_to_psu_map.size > 0 else 0 + if n_psu >= n_groups_target: + # Identity (each group its own PSU) — skip the broadcast for a + # bit-identical fast path matching the pre-PSU-bootstrap behavior. 
+ return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + # Hall-Mammen wild PSU bootstrap: draw n_psu multipliers, broadcast + # via the dense index map so all groups in the same PSU share a + # multiplier. Preserves clustered sampling structure. + psu_weights = _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_psu, + weight_type=weight_type, + rng=rng, + ) + return psu_weights[:, group_to_psu_map] + + def _bootstrap_one_target( u_centered: np.ndarray, divisor: int, @@ -351,6 +450,7 @@ def _bootstrap_one_target( rng: np.random.Generator, context: str, return_distribution: bool, + group_to_psu_map: Optional[np.ndarray] = None, ) -> Tuple[float, Tuple[float, float], float, Optional[np.ndarray]]: """ Run the multiplier bootstrap for a single dCDH target. @@ -367,16 +467,24 @@ def _bootstrap_one_target( sample mean of the bootstrap distribution should be approximately zero, not the original effect). The original effect is passed separately as the centering point for the percentile p-value. + + When ``group_to_psu_map`` is provided (length ``len(u_centered)``, + dense integer PSU indices), multiplier weights are generated at the + PSU level and broadcast to groups so all groups in the same PSU + receive the same bootstrap multiplier. This is the Hall-Mammen wild + PSU bootstrap; it reduces to the group-level bootstrap when each + group is its own PSU (identity map). 
""" n_groups_target = u_centered.shape[0] if n_groups_target == 0 or divisor == 0: return np.nan, (np.nan, np.nan), np.nan, None - weight_matrix = _generate_bootstrap_weights_batch( + weight_matrix = _generate_psu_or_group_weights( n_bootstrap=n_bootstrap, - n_units=n_groups_target, + n_groups_target=n_groups_target, weight_type=weight_type, rng=rng, + group_to_psu_map=group_to_psu_map, ) # Each bootstrap replicate: (1 / divisor) * sum_g w_b[g] * u_centered[g] diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7e6ef60b..5efb8f13 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -266,7 +266,7 @@ est.fit( trends_nonparam: Any | None = None, # Phase 3: DID^s honest_did: bool = False, # Phase 3: HonestDiD integration # ---- survey support ---- - survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) + survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) OR BRR/Fay/JK1/JKn/SDR replicate weights; n_bootstrap > 0 uses Hall-Mammen PSU multiplier ) -> ChaisemartinDHaultfoeuilleResults ``` @@ -1672,8 +1672,8 @@ sd_female, data_female = sd.subpopulation(data, mask=lambda df: df['sex'] == 'F' **Key features:** - Taylor Series Linearization (TSL) variance with strata + PSU + FPC -- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (12 of 16 estimators) -- Survey-aware bootstrap: multiplier at PSU or Rao-Wu rescaled +- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (13 of 16 estimators, including dCDH) +- Survey-aware bootstrap: multiplier at PSU (Hall-Mammen wild; dCDH, staggered) or Rao-Wu rescaled (SyntheticDiD, SunAbraham) - DEFF diagnostics, subpopulation analysis, weight trimming (`trim_weights`) - Repeated cross-sections: `CallawaySantAnna(panel=False)` - Compatibility matrix: see `docs/choosing_estimator.rst` Survey Design Support section diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 0bf3c5de..60444f9e 100644 --- 
a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -648,9 +648,9 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface -- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). -- **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. 
This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=group` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula.
All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(n_valid_across_sites) - 1` propagates to t-critical values. 
--- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c2c7c0f0..1aa9f14c 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -317,8 +317,16 @@ def test_varying_weights_change_att(self, base_data): result_survey.overall_att, abs=0.01 ) - def test_rejects_replicate_weights(self, base_data): - """Replicate weight variance not yet supported.""" + def test_rejects_replicate_weights_with_bootstrap(self, base_data): + """Replicate weights combined with n_bootstrap > 0 is rejected. + + Replicate variance is closed-form (compute_replicate_if_variance); + combining it with a multiplier bootstrap would double-count + variance. Matches library precedent in efficient_did.py:989, + staggered.py:1869, two_stage.py:251-253. The standalone + replicate-only path (n_bootstrap=0) is supported separately; + see tests/test_survey_dcdh_replicate_psu.py. + """ df = base_data.copy() df["pw"] = 1.0 df["rep1"] = 1.0 @@ -328,8 +336,8 @@ def test_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - ChaisemartinDHaultfoeuille().fit( + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( df, outcome="outcome", group="group", @@ -366,9 +374,31 @@ def test_multi_horizon_survey_runs(self, data_with_survey): class TestBootstrapSurveyWarning: - def test_bootstrap_survey_emits_warning(self, data_with_survey): + def test_bootstrap_survey_auto_inject_no_warning(self, data_with_survey): + """Under auto-inject psu=group, Hall-Mammen wild PSU bootstrap + coincides with the group-level multiplier bootstrap — so no + warning should fire. The old 'PSU-level deferred' warning has + been replaced with a conditional one that only fires when the + user passes a strictly coarser PSU. 
+ + See the new test file tests/test_survey_dcdh_replicate_psu.py for + the strictly-coarser-PSU case where the warning DOES fire. + """ sd = SurveyDesign(weights="pw") - with pytest.warns(UserWarning, match="group-level multiplier"): + import warnings as _w + + with _w.catch_warnings(): + _w.simplefilter("error", UserWarning) + # Ignore warnings unrelated to the bootstrap-PSU contract. + _w.filterwarnings( + "ignore", message="Single-period placebo SE" + ) + _w.filterwarnings( + "ignore", message="pweight weights normalized" + ) + _w.filterwarnings( + "ignore", message="Assumption 11" + ) ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( data_with_survey, outcome="outcome", @@ -728,9 +758,11 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): survey_design=sd, ) - def test_twfe_helper_rejects_replicate_weights(self, base_data): - """Replicate-weight survey designs must be rejected by the helper, - matching fit()'s NotImplementedError contract.""" + def test_twfe_helper_accepts_replicate_weights(self, base_data): + """Replicate-weight designs are accepted by the helper (no SE field + on TWFEWeightsResult, so only cell aggregation is affected). 
Matches + fit()'s new contract where replicate variance runs analytically via + compute_replicate_if_variance.""" from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights df_ = base_data.copy() @@ -742,13 +774,15 @@ def test_twfe_helper_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - twowayfeweights( - df_, - outcome="outcome", group="group", - time="period", treatment="treatment", - survey_design=sd, - ) + result = twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.beta_fe) + assert np.isfinite(result.sigma_fe) + assert 0.0 <= result.fraction_negative <= 1.0 # ── Test: TWFE diagnostic oracle under survey ─────────────────────── diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py new file mode 100644 index 00000000..aa1be454 --- /dev/null +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -0,0 +1,592 @@ +"""Replicate-weight variance and PSU-level bootstrap tests for dCDH. + +Covers the two survey-variance extensions added after PR #307: +1. Replicate-weight variance (BRR/Fay/JK1/JKn/SDR) via the unified + `compute_replicate_if_variance` helper, routed through the inline + branch in `_survey_se_from_group_if` (Class A sites) and the + parallel inline branch in `_compute_heterogeneity_test` (Class B). +2. PSU-level Hall-Mammen wild bootstrap via `group_to_psu_map` + threaded through `_compute_dcdh_bootstrap`, with an identity-map + fast path preserving auto-inject `psu=group` semantics. 
+""" + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign +from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + +# ── Fixtures ──────────────────────────────────────────────────────── + + +REPLICATE_METHODS = ["BRR", "Fay", "JK1", "JKn", "SDR"] + + +def _make_reversible_panel( + n_groups: int = 30, + n_periods: int = 5, + seed: int = 42, +) -> pd.DataFrame: + """Simple reversible-treatment panel with two switch cohorts.""" + rng = np.random.default_rng(seed) + rows = [] + for g in range(n_groups): + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "x_het": float(g) * 0.1, + } + ) + return pd.DataFrame(rows) + + +def _attach_replicate_weights( + df: pd.DataFrame, + R: int, + method: str, + seed: int = 101, +) -> pd.DataFrame: + """Add R replicate-weight columns and return the mutated DataFrame.""" + rng = np.random.default_rng(seed) + df = df.copy() + for r in range(R): + if method == "BRR" or method == "Fay" or method == "SDR": + df[f"rep{r}"] = rng.choice([0.5, 1.5], size=len(df)) + elif method in ("JK1", "JKn"): + df[f"rep{r}"] = rng.uniform(0.5, 1.5, size=len(df)) + return df + + +def _rep_cols(R: int) -> list: + return [f"rep{r}" for r in range(R)] + + +def _build_replicate_design(R: int, method: str) -> SurveyDesign: + """Build a SurveyDesign object with method-specific required params.""" + if method == "Fay": + return SurveyDesign( + weights="pw", + replicate_weights=_rep_cols(R), + replicate_method="Fay", + fay_rho=0.5, + ) + if method == "JKn": + return SurveyDesign( + weights="pw", + replicate_weights=_rep_cols(R), + replicate_method="JKn", + replicate_strata=[r % 2 for r in range(R)], + ) + return SurveyDesign( + weights="pw", + 
replicate_weights=_rep_cols(R), + replicate_method=method, + ) + + +@pytest.fixture(scope="module") +def base_panel(): + return _make_reversible_panel(n_groups=30, n_periods=5, seed=42) + + +@pytest.fixture +def replicate_design(): + """Helper constructing a SurveyDesign with replicate weights for a given method.""" + + def _build(df: pd.DataFrame, R: int, method: str, seed: int = 101) -> pd.DataFrame: + return _attach_replicate_weights(df, R=R, method=method, seed=seed) + + return _build + + +def _make_strictly_coarser_psu_panel( + n_groups_per_psu: int = 3, + n_psu: int = 4, + n_periods: int = 5, + seed: int = 17, +) -> pd.DataFrame: + """Panel where groups nest strictly inside PSUs. + + 12 groups across 4 PSUs (3 groups/PSU). Two switch cohorts so the + estimator has both joiners at different F_g. + """ + rng = np.random.default_rng(seed) + n_groups = n_groups_per_psu * n_psu + rows = [] + for g in range(n_groups): + psu = g // n_groups_per_psu + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "psu": psu, + } + ) + return pd.DataFrame(rows) + + +# ── 1. Class A replicate (sites via _survey_se_from_group_if) ──────── + + +class TestReplicateClassA: + """Replicate variance wired through the inline branch in + `_survey_se_from_group_if` — inherited by overall DID_M, joiners + DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l. 
+ """ + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_overall_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) + assert res.overall_se > 0 + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_inference_fields_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_t_stat) + assert np.isfinite(res.overall_p_value) + assert np.all(np.isfinite(res.overall_conf_int)) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_df_survey_reflects_n_valid(self, base_panel, replicate_design, method): + R = 24 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # All replicates should succeed on a clean panel → df = n_valid - 1 = R - 1 + assert res.survey_metadata.df_survey == R - 1 + + def test_jk1_converges_to_tsl(self, replicate_design): + """JK1 replicate SE should be in the same order of magnitude as + analytical TSL SE on a large panel. 
Uses a loose 50% tolerance + because convergence depends on weight-construction details; the + stringent R-parity convergence test lives in + `test_survey_phase6.py`.""" + df = _make_reversible_panel(n_groups=60, n_periods=6, seed=42) + R = 30 + df_rep = replicate_design(df, R=R, method="JK1") + + sd_rep = _build_replicate_design(R, "JK1") + sd_tsl = SurveyDesign(weights="pw") + + res_rep = ChaisemartinDHaultfoeuille(seed=1).fit( + df_rep, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_tsl = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_tsl, + ) + # Same point estimate (both unweighted) + assert res_rep.overall_att == pytest.approx(res_tsl.overall_att, rel=1e-6) + # SEs should be within 50% of each other (broad envelope) + ratio = res_rep.overall_se / res_tsl.overall_se + assert 0.5 <= ratio <= 2.0, ( + f"Replicate SE ({res_rep.overall_se:.4f}) and TSL SE " + f"({res_tsl.overall_se:.4f}) differ by more than 2x." 
+ ) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_multi_horizon_under_replicate( + self, replicate_design, method + ): + """Multi-horizon DID_l inherits the Class A dispatch — horizon 1 + (always identifiable on this panel) should produce a finite + replicate-based SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_placebo_under_replicate(self, replicate_design): + """Placebo DID^{pl}_l under replicate weights also carries finite SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + if res.placebo_event_study: + info = res.placebo_event_study.get(1) + if info is not None and info.get("n_obs", 0) > 0: + assert np.isfinite(info["se"]) + + def test_did_x_replicate(self, base_panel, replicate_design): + """DID^X covariate adjustment flows through the Class A dispatch + (Y_mat residualization happens upstream of IF).""" + df = base_panel.copy() + rng = np.random.default_rng(123) + df["cov1"] = rng.normal(size=len(df)) + R = 20 + df = replicate_design(df, R=R, method="JK1") + sd = _build_replicate_design(R, "JK1") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + controls=["cov1"], + L_max=1, + ) + assert np.isfinite(res.overall_att) + assert 
np.isfinite(res.overall_se) and res.overall_se > 0 + + +# ── 2. Class B replicate (heterogeneity + twowayfeweights) ────────── + + +class TestReplicateClassB: + """Sites that call `compute_survey_if_variance` directly (or don't + route through the variance path at all) and need their own + replicate handling.""" + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_heterogeneity_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + heterogeneity="x_het", + ) + assert res.heterogeneity_effects is not None + info = res.heterogeneity_effects[1] + assert np.isfinite(info["beta"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_twowayfeweights_accepts_replicate(self, base_panel, replicate_design): + """TWFE diagnostic produces the same beta_fe / sigma_fe / + fraction_negative under a replicate design as under a + pweight-only design (replicate weights only affect + `resolved.weights` cell aggregation).""" + R = 20 + df = replicate_design(base_panel, R=R, method="BRR") + sd_rep = _build_replicate_design(R, "BRR") + sd_plain = SurveyDesign(weights="pw") + + res_rep = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_plain = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_plain, + ) + # Cell aggregation uses resolved.weights; replicate_weights are + # present but don't change the aggregated diagnostics. 
+ assert res_rep.beta_fe == pytest.approx(res_plain.beta_fe, rel=1e-10) + assert res_rep.sigma_fe == pytest.approx(res_plain.sigma_fe, rel=1e-10) + assert res_rep.fraction_negative == pytest.approx( + res_plain.fraction_negative, rel=1e-10 + ) + + +# ── 3. PSU-level Hall-Mammen wild bootstrap ───────────────────────── + + +class TestPSUBootstrap: + + def test_auto_inject_bit_identical_to_group_level(self, base_panel): + """Under auto-inject psu=group the Hall-Mammen PSU bootstrap + collapses to the group-level multiplier bootstrap via the + identity-map fast path — producing bit-identical SE.""" + df = base_panel.copy() + sd_auto = SurveyDesign(weights="pw") + sd_explicit = SurveyDesign(weights="pw", psu="group") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + r_auto = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_auto, + ) + r_explicit = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_explicit, + ) + assert r_auto.overall_se == pytest.approx(r_explicit.overall_se, rel=1e-10) + + def test_coarser_psu_produces_finite_se(self): + """Under a strictly coarser PSU, the Hall-Mammen PSU bootstrap + still produces a finite SE (the value depends on within-PSU + IF correlation).""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_no_warning_under_auto_inject(self, base_panel): + """No UserWarning about PSU-level bootstrap should fire 
under + auto-inject psu=group (the warning only fires for strictly + coarser PSU).""" + sd = SurveyDesign(weights="pw") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert not any( + "Hall-Mammen wild" in str(w.message) + or "group-level multiplier" in str(w.message) + for w in caught + ), ( + "Bootstrap-PSU warning should not fire under auto-inject psu=group, " + f"but got: {[str(w.message) for w in caught]}" + ) + + def test_warning_under_coarser_psu(self): + """When PSU is strictly coarser than group, the Hall-Mammen + warning fires.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with pytest.warns(UserWarning, match="Hall-Mammen wild"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + @pytest.mark.parametrize("weight_type", ["rademacher", "mammen", "webb"]) + def test_all_weight_types_under_psu(self, weight_type): + """Rademacher / Mammen / Webb multipliers should all produce + finite SE under the PSU-level path.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille( + n_bootstrap=200, seed=1, bootstrap_weights=weight_type + ).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_multi_horizon_psu_bootstrap(self): + """Event study bootstrap inherits PSU broadcasting.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + 
warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=300, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_sup_t_under_coarser_psu(self): + """Sup-t critical value is computable under coarser PSU.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # sup_t_bands may be None if only one horizon is valid or if + # the cband critical value was not computed. + if res.sup_t_bands is not None: + assert res.sup_t_bands.get("cband_crit_value") is None or ( + res.sup_t_bands["cband_crit_value"] > 1.0 + ) + + +# ── 4. 
Invariants and cross-cutting contracts ──────────────────────── + + +class TestInvariants: + + def test_replicate_plus_bootstrap_rejected(self, base_panel, replicate_design): + """Replicate variance + n_bootstrap > 0 raises NotImplementedError.""" + R = 10 + df = replicate_design(base_panel, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + def test_non_survey_unchanged(self, base_panel): + """Non-survey fits (survey_design=None) should produce bit- + identical SE to the pre-PSU-bootstrap behavior.""" + r1 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + r2 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_honest_did_under_replicate( + self, base_panel, replicate_design, method + ): + """HonestDiD bounds should flow through replicate SE and the + reduced df_survey (df = min(n_valid) - 1).""" + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + honest_did=True, + ) + assert res.survey_metadata.df_survey == R - 1 + if res.honest_did_results is not None: + # HonestDiD bounds exist and are finite; exact structure is + # honest_did.py internals. 
Just verify the attribute is + # populated so the SE flow-through is exercised. + assert res.honest_did_results is not None From 53dc755e6b0d1ae63f03fe1d90597b5705ee16f4 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:38:12 -0400 Subject: [PATCH 02/11] Harden PSU-map propagation + update fit() survey_design docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses local AI review findings: 1. P1: `_slice_psu_map` now enforces strict length equality between `target_size` and the full PSU map. Under the current bootstrap architecture all targets (overall DID_M, joiners DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l) use the same variance-eligible group ordering from `_eligible_group_ids`, so the old prefix-slicing was a silent no-op. Making the check strict converts a fragile unstated invariant into a loud `ValueError` if a future refactor introduces a target whose group subset differs from the overall ordering (e.g., an aggregated target built from a non-prefix mask) — preventing a silent miscluster bug. 2. P2: `fit()` `survey_design` docstring updated to describe: - Replicate-weight variance support (BRR/Fay/JK1/JKn/SDR). - PSU-level Hall-Mammen wild bootstrap contract. - Replicate + `n_bootstrap > 0` rejection. 3. P2: `twowayfeweights()` docstring updated to document replicate acceptance. 4. P2: Added 3 regression tests in `tests/test_survey_dcdh_replicate_psu.py`: - `test_psu_map_size_mismatch_raises`: exercises the new strict equality check. - `test_generate_psu_or_group_weights_broadcast`: direct unit test of PSU-level broadcast — groups in the same PSU receive the same multiplier within a bootstrap replicate. - `test_generate_psu_or_group_weights_identity`: verifies the identity-map fast path is bit-identical to the non-PSU path. All 321 dCDH-related tests still pass. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 29 ++++++-- .../chaisemartin_dhaultfoeuille_bootstrap.py | 38 +++++++--- tests/test_survey_dcdh_replicate_psu.py | 74 +++++++++++++++++++ 3 files changed, 126 insertions(+), 15 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 9af4b6c0..d4905f30 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -604,11 +604,25 @@ def fit( ``drop_larger_lower=False`` to retain 2-switch groups. survey_design : SurveyDesign, optional Survey design specification for design-based inference. - Supports pweight with strata/PSU/FPC via Taylor Series - Linearization. Survey weights produce weighted cell means - for the point estimate; the IF-based variance accounts for - the survey design structure. Replicate weights are not yet - supported. + Supports ``weight_type='pweight'`` with two variance paths: + (1) Taylor Series Linearization using strata / PSU / FPC + (analytical) and (2) replicate-weight variance using + BRR / Fay / JK1 / JKn / SDR methods (analytical, + closed-form). Survey weights produce weighted cell means + for the point estimate. Under a survey design without an + explicit ``psu``, ``fit()`` auto-injects ``psu=group`` + so the group is the effective sampling unit (strata / PSU + must be within-group-constant; mixed labels within a group + raise ``ValueError``). When ``n_bootstrap > 0`` and a + survey design is supplied, the multiplier bootstrap + operates at the PSU level (Hall-Mammen wild PSU + bootstrap) — under the default auto-inject this collapses + to a group-level clustered bootstrap. **Replicate weights + combined with ``n_bootstrap > 0`` are rejected** with + ``NotImplementedError`` (replicate variance is + closed-form; bootstrap would double-count variance). See + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes for the + full contract.
Returns ------- @@ -5412,6 +5426,11 @@ def twowayfeweights( (matching ``fit(..., survey_design=sd).twfe_*``). Required to preserve fit-vs-helper parity under survey-backed inputs. Only ``weight_type='pweight'`` is supported; other types raise ValueError. + Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are accepted — + the TWFE diagnostic has no SE field on ``TWFEWeightsResult``, + so replicate weights only affect the cell aggregation path + (aggregated numbers are identical to + ``fit(..., survey_design=sd).twfe_*`` under the same input). Returns ------- diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 06eec38a..a3e7a918 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -357,20 +357,38 @@ def _slice_psu_map( group_to_psu_map: Optional[np.ndarray], target_size: int, ) -> Optional[np.ndarray]: - """Slice a full-set group-to-PSU map to a target's contributing group count. - - Multi-horizon / joiner / leaver targets share the overall variance- - eligible group ordering but may have fewer contributing groups - (``u_centered.shape[0] <= n_groups_for_overall``). Slice the first - ``target_size`` entries — this mirrors the existing shared-weights - truncation at the multi-horizon bootstrap site. Returns ``None`` - when no map is provided (plain multiplier-bootstrap path). + """Return a PSU map aligned to a subset bootstrap target. + + The dCDH bootstrap passes a single full-length ``group_to_psu_map`` + built in ``fit()`` from ``_eligible_group_ids`` (the variance- + eligible group ordering). By construction all current bootstrap + targets (overall, joiners, leavers, multi-horizon DID_l, placebo + DID^pl_l) use that same ordering — each target's IF vector has the + same length as the map and the entries correspond to the same + groups in the same order. 
This invariant is enforced by requiring + ``target_size == len(group_to_psu_map)``; if a future refactor + introduces a target whose group subset differs from the overall + variance-eligible ordering, this assertion fires loudly rather + than silently misclustering the bootstrap draws. + + Returns ``None`` when no map is provided (plain multiplier- + bootstrap path — identity across targets). Raises ``ValueError`` + when the target size does not match the map length: callers must + supply a target-specific map (not a truncation) if they introduce + non-aligned subsets. """ if group_to_psu_map is None: return None - if target_size <= 0 or target_size > len(group_to_psu_map): + if target_size == len(group_to_psu_map): return group_to_psu_map - return group_to_psu_map[:target_size] + raise ValueError( + f"PSU map length ({len(group_to_psu_map)}) does not match " + f"bootstrap target size ({target_size}). dCDH's bootstrap contract " + f"requires all targets to use the same variance-eligible group " + f"ordering as `_eligible_group_ids`. If this target has a " + f"different ordering, construct a target-specific map keyed by " + f"its actual group IDs rather than truncating the full map." + ) def _generate_psu_or_group_weights( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index aa1be454..1fa9409a 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -563,6 +563,80 @@ def test_non_survey_unchanged(self, base_panel): ) assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) + def test_psu_map_size_mismatch_raises(self): + """`_slice_psu_map` enforces strict length equality to prevent + silent miscluster if a future bootstrap target uses a different + group ordering than `_eligible_group_ids`. 
Today all targets + align so slicing is a no-op — this guards the invariant.""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _slice_psu_map, + ) + + full_map = np.array([0, 0, 1, 1, 2], dtype=np.int64) + # Exact length: no-op, returns the full map + assert np.array_equal(_slice_psu_map(full_map, 5), full_map) + # None passthrough + assert _slice_psu_map(None, 5) is None + # Mismatched length: loud failure + with pytest.raises(ValueError, match="PSU map length"): + _slice_psu_map(full_map, 3) + + def test_generate_psu_or_group_weights_broadcast(self): + """Direct unit test of the PSU-level weight generator: + groups mapped to the same PSU receive the same multiplier + within a single bootstrap replicate (Hall-Mammen wild PSU + contract).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + # 6 groups → 3 PSUs: groups (0,1), (2,3), (4,5) share multipliers. + group_to_psu = np.array([0, 0, 1, 1, 2, 2], dtype=np.int64) + rng = np.random.default_rng(0) + W = _generate_psu_or_group_weights( + n_bootstrap=50, + n_groups_target=6, + weight_type="rademacher", + rng=rng, + group_to_psu_map=group_to_psu, + ) + assert W.shape == (50, 6) + # Groups in the same PSU must receive the same multiplier + # within each bootstrap replicate. + assert np.array_equal(W[:, 0], W[:, 1]) + assert np.array_equal(W[:, 2], W[:, 3]) + assert np.array_equal(W[:, 4], W[:, 5]) + # Different PSUs should NOT all produce identical columns + # (given 50 replicates and Rademacher weights, collision + # probability is 2^-49 → effectively 0). 
+ assert not np.array_equal(W[:, 0], W[:, 2]) + + def test_generate_psu_or_group_weights_identity(self): + """Identity map (each group its own PSU) uses the fast path and + produces group-level weights bit-identical to the non-PSU path.""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + identity_map = np.arange(6, dtype=np.int64) + rng1 = np.random.default_rng(0) + W_identity = _generate_psu_or_group_weights( + n_bootstrap=20, + n_groups_target=6, + weight_type="mammen", + rng=rng1, + group_to_psu_map=identity_map, + ) + rng2 = np.random.default_rng(0) + W_plain = _generate_psu_or_group_weights( + n_bootstrap=20, + n_groups_target=6, + weight_type="mammen", + rng=rng2, + group_to_psu_map=None, + ) + assert np.array_equal(W_identity, W_plain) + @pytest.mark.parametrize("method", ["BRR", "JK1"]) def test_honest_did_under_replicate( self, base_panel, replicate_design, method From 2f8ce4c8630782c45dd2d2ef8d10dfb219bcee46 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:44:27 -0400 Subject: [PATCH 03/11] Build PSU map from group IDs, not shared positional array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the AI review R2 finding that the prior shared-map + length-assertion approach still relied on an unstated invariant (all bootstrap targets use the variance-eligible group ordering). Changes: - `_compute_dcdh_bootstrap` now accepts `group_id_to_psu_code: Dict[Any, int]` and `eligible_group_ids: np.ndarray` instead of a pre-built flat `group_to_psu_map` array. Every target call constructs its PSU map via `_map_for_target(target_size, group_id_to_psu_code, eligible_group_ids)` — a dict lookup per group ID, not a positional reuse. The length-assertion is retained for the size-mismatch case and now sits alongside a KeyError-based check for missing group IDs. - `_slice_psu_map` removed; `_map_for_target` replaces it. 
The explicit per-target construction removes the hidden semantic assumption that was previously enforced only by cardinality. - `fit()` builds the dict from `_eligible_group_ids` + the dense PSU codes. The per-target maps are all identical today (because every current target uses `_eligible_group_ids` ordering), but that is now a consequence of the caller's choice rather than a brittle assumption buried in the mixin. - Test `test_psu_map_size_mismatch_raises` replaced by `test_map_for_target_id_lookup`, which exercises the ID-lookup semantics including: - Correct map construction for a given eligible-group ordering. - Non-prefix reordering produces a different map (proving the ordering-from-IDs contract). - Length mismatch → ValueError. - Missing-group → ValueError. Full regression: 401 passing (previously 321). --- diff_diff/chaisemartin_dhaultfoeuille.py | 39 +++--- .../chaisemartin_dhaultfoeuille_bootstrap.py | 117 ++++++++++++------ tests/test_survey_dcdh_replicate_psu.py | 49 +++++--- 3 files changed, 134 insertions(+), 71 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d4905f30..54890d35 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2114,16 +2114,17 @@ def fit( h_data["did_l"], ) - # Under a survey design with PSU information, build a dense - # group_to_psu_map so the bootstrap randomizes at the PSU - # level rather than at the group level (Hall-Mammen wild PSU - # bootstrap). PSU/strata are within-group-constant by the - # _validate_group_constant_strata_psu precondition, so each - # variance-eligible group has exactly one PSU label. Under - # auto-inject psu=group the map is the identity and the - # bootstrap mixin's fast path reproduces the pre-PSU - # behavior bit-for-bit. 
- group_to_psu_map_bootstrap: Optional[np.ndarray] = None + # Under a survey design with PSU information, build a + # `group_id_to_psu_code` dict so the bootstrap mixin can + # derive each target's PSU map by ID lookup rather than by + # positional reuse. PSU/strata are within-group-constant by + # the _validate_group_constant_strata_psu precondition, so + # each variance-eligible group has exactly one PSU label. + # Under auto-inject psu=group each group has a unique PSU + # code and the bootstrap mixin's identity-map fast path + # reproduces the pre-PSU behavior bit-for-bit. + group_id_to_psu_code_bootstrap: Optional[Dict[Any, int]] = None + eligible_group_ids_bootstrap: Optional[np.ndarray] = None if ( resolved_survey is not None and getattr(resolved_survey, "psu", None) is not None @@ -2144,17 +2145,20 @@ def fit( break labels = obs_psu_codes[mask_g] # Within-group-constant PSU is validated upstream; - # .item() on the first is robust to integer vs string. + # the first label represents the whole group. group_psu_labels.append(labels[0]) if valid_map and len(group_psu_labels) == n_groups_for_overall_var: - # Dense integer indices for _generate_psu_or_group_weights. - _, group_to_psu_map_bootstrap = np.unique( + # Factor PSU labels to dense integer codes. 
+ _, dense_codes = np.unique( np.asarray(group_psu_labels), return_inverse=True, ) - group_to_psu_map_bootstrap = np.asarray( - group_to_psu_map_bootstrap, dtype=np.int64 - ) + dense_codes = np.asarray(dense_codes, dtype=np.int64) + group_id_to_psu_code_bootstrap = { + gid: int(code) + for gid, code in zip(_eligible_group_ids, dense_codes) + } + eligible_group_ids_bootstrap = np.asarray(_eligible_group_ids) br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, @@ -2166,7 +2170,8 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, - group_to_psu_map=group_to_psu_map_bootstrap, + group_id_to_psu_code=group_id_to_psu_code_bootstrap, + eligible_group_ids=eligible_group_ids_bootstrap, ) bootstrap_results = br diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a3e7a918..a85ae446 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -19,7 +19,7 @@ produce a bootstrap distribution per target. """ -from typing import TYPE_CHECKING, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np @@ -71,7 +71,8 @@ def _compute_dcdh_bootstrap( multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, # --- Survey: PSU-level bootstrap under survey designs --- - group_to_psu_map: Optional[np.ndarray] = None, + group_id_to_psu_code: Optional[Dict[Any, int]] = None, + eligible_group_ids: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. 
@@ -162,6 +163,14 @@ def _compute_dcdh_bootstrap( ) rng = np.random.default_rng(self.seed) + # PSU label for each bootstrap weight column is derived from + # the group's ID via `_map_for_target`, not from positional + # truncation. All current dCDH bootstrap targets use the + # variance-eligible group ordering (`eligible_group_ids`); if a + # future target uses a different ordering, add a dedicated + # group-IDs parameter for it rather than reusing the overall + # eligible list. + # --- Overall DID_M --- # Skip the scalar DID_M bootstrap when divisor_overall <= 0 # (e.g., pure non-binary panels where N_S=0), but continue @@ -177,7 +186,11 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, - group_to_psu_map=group_to_psu_map, + group_to_psu_map=_map_for_target( + u_centered_overall.shape[0], + group_id_to_psu_code, + eligible_group_ids, + ), ) else: overall_se = np.nan @@ -209,7 +222,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_j.size), + group_to_psu_map=_map_for_target( + u_j.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -229,7 +244,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_l.size), + group_to_psu_map=_map_for_target( + u_l.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -249,7 +266,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_pl.size), + group_to_psu_map=_map_for_target( + u_pl.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -276,7 +295,9 @@ def 
_compute_dcdh_bootstrap( n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, - group_to_psu_map=group_to_psu_map, + group_to_psu_map=_map_for_target( + n_groups_mh, group_id_to_psu_code, eligible_group_ids, + ), ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): @@ -335,7 +356,9 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_h.size), + group_to_psu_map=_map_for_target( + u_h.size, group_id_to_psu_code, eligible_group_ids, + ), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -353,42 +376,56 @@ def _compute_dcdh_bootstrap( # ============================================================================= -def _slice_psu_map( - group_to_psu_map: Optional[np.ndarray], +def _map_for_target( target_size: int, + group_id_to_psu_code: Optional[Dict[Any, int]], + eligible_group_ids: Optional[np.ndarray], ) -> Optional[np.ndarray]: - """Return a PSU map aligned to a subset bootstrap target. - - The dCDH bootstrap passes a single full-length ``group_to_psu_map`` - built in ``fit()`` from ``_eligible_group_ids`` (the variance- - eligible group ordering). By construction all current bootstrap - targets (overall, joiners, leavers, multi-horizon DID_l, placebo - DID^pl_l) use that same ordering — each target's IF vector has the - same length as the map and the entries correspond to the same - groups in the same order. This invariant is enforced by requiring - ``target_size == len(group_to_psu_map)``; if a future refactor - introduces a target whose group subset differs from the overall - variance-eligible ordering, this assertion fires loudly rather - than silently misclustering the bootstrap draws. - - Returns ``None`` when no map is provided (plain multiplier- - bootstrap path — identity across targets). 
Raises ``ValueError`` - when the target size does not match the map length: callers must - supply a target-specific map (not a truncation) if they introduce - non-aligned subsets. + """Build a PSU map for a bootstrap target from IDs (not positions). + + The caller passes: + - ``group_id_to_psu_code``: a dict mapping each variance-eligible + group ID to its dense PSU code (built once in ``fit()``). + - ``eligible_group_ids``: the ordered list of group IDs that + correspond to the current target's ``u_centered`` vector. + + Returns an integer array of length ``target_size`` where entry + ``i`` is the PSU code for the ``i``-th contributing group. + + Returns ``None`` when no PSU information is available (plain + multiplier-bootstrap path — identity across targets). + + Raises ``ValueError`` if ``target_size`` does not match + ``len(eligible_group_ids)``: every current dCDH bootstrap target + uses the variance-eligible group ordering, so any size mismatch + signals that a caller introduced a target whose group subset + diverges and should pass its own ``target_group_ids`` rather than + reusing the overall eligible list. Also raises ``ValueError`` if + any group ID is missing from the dict (signaling misalignment + between the target's IF vector and the map's keys). """ - if group_to_psu_map is None: + if group_id_to_psu_code is None or eligible_group_ids is None: return None - if target_size == len(group_to_psu_map): - return group_to_psu_map - raise ValueError( - f"PSU map length ({len(group_to_psu_map)}) does not match " - f"bootstrap target size ({target_size}). dCDH's bootstrap contract " - f"requires all targets to use the same variance-eligible group " - f"ordering as `_eligible_group_ids`. If this target has a " - f"different ordering, construct a target-specific map keyed by " - f"its actual group IDs rather than truncating the full map." 
- ) + if target_size != len(eligible_group_ids): + raise ValueError( + f"Bootstrap target size ({target_size}) does not match " + f"eligible_group_ids length ({len(eligible_group_ids)}). " + "dCDH's bootstrap contract requires all current targets to " + "use the variance-eligible group ordering; if a new target " + "has a different ordering, pass target-specific group IDs " + "to _map_for_target rather than reusing eligible_group_ids." + ) + try: + return np.array( + [group_id_to_psu_code[gid] for gid in eligible_group_ids], + dtype=np.int64, + ) + except KeyError as e: + raise ValueError( + f"Group ID {e.args[0]!r} in eligible_group_ids has no entry " + f"in group_id_to_psu_code — PSU map is misaligned with the " + f"bootstrap target's group set." + ) from e def _generate_psu_or_group_weights( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 1fa9409a..836583d2 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -563,23 +563,44 @@ def test_non_survey_unchanged(self, base_panel): ) assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) - def test_psu_map_size_mismatch_raises(self): - """`_slice_psu_map` enforces strict length equality to prevent - silent miscluster if a future bootstrap target uses a different - group ordering than `_eligible_group_ids`. Today all targets - align so slicing is a no-op — this guards the invariant.""" + def test_map_for_target_id_lookup(self): + """`_map_for_target` builds the PSU map from group IDs via dict + lookup, not positional reuse. All current dCDH bootstrap + targets use the variance-eligible group ordering, so the + helper looks up each target group's PSU via + `group_id_to_psu_code`. 
Length mismatch → ValueError (loud + failure rather than silent miscluster).""" from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( - _slice_psu_map, + _map_for_target, ) - full_map = np.array([0, 0, 1, 1, 2], dtype=np.int64) - # Exact length: no-op, returns the full map - assert np.array_equal(_slice_psu_map(full_map, 5), full_map) - # None passthrough - assert _slice_psu_map(None, 5) is None - # Mismatched length: loud failure - with pytest.raises(ValueError, match="PSU map length"): - _slice_psu_map(full_map, 3) + gid_to_psu = {"a": 0, "b": 0, "c": 1, "d": 1, "e": 2} + eligible = np.asarray(["a", "b", "c", "d", "e"]) + expected = np.array([0, 0, 1, 1, 2], dtype=np.int64) + built = _map_for_target(5, gid_to_psu, eligible) + assert built is not None + assert np.array_equal(built, expected) + + # None passthrough when no PSU info + assert _map_for_target(5, None, None) is None + + # Length mismatch → loud failure + with pytest.raises(ValueError, match="target size"): + _map_for_target(3, gid_to_psu, eligible) + + # Missing group ID → loud failure + gid_to_psu_incomplete = {"a": 0, "b": 0, "c": 1, "d": 1} + with pytest.raises(ValueError, match="has no entry"): + _map_for_target(5, gid_to_psu_incomplete, eligible) + + # Non-prefix reordering: different ordered group IDs produce a + # different PSU map even if the dict is the same. This is the + # key invariant the previous prefix-slicing lacked. 
+ eligible_reordered = np.asarray(["c", "a", "d", "e", "b"]) + built_reordered = _map_for_target(5, gid_to_psu, eligible_reordered) + assert built_reordered is not None + expected_reordered = np.array([1, 0, 1, 2, 0], dtype=np.int64) + assert np.array_equal(built_reordered, expected_reordered) def test_generate_psu_or_group_weights_broadcast(self): """Direct unit test of the PSU-level weight generator: From 84c9895b58a367b06187ddfb0ffa5a09ec514294 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:46:57 -0400 Subject: [PATCH 04/11] Enforce explicit multi-horizon alignment in shared-draw bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AI review R3 flagged the remaining positional truncation in the multi-horizon shared-draw path: w_h = shared_weights[:, : u_h.size] Under the current contract every horizon's IF vector uses the full variance-eligible group ordering, so `u_h.size == n_groups_mh` always holds and the slice is a no-op. But the truncation itself is a hidden positional assumption — if a future refactor introduced horizon- specific masking, the shared-weight alignment would break silently. Replace the truncation with: 1. An explicit assertion `u_h.size == n_groups_mh` that raises with a message pointing future refactors at the right fix (threading target-specific group IDs through `multi_horizon_inputs`). 2. Direct reuse of the full `shared_weights` matrix — removes the slice semantic entirely. Add `TestPSUBootstrap.test_multi_horizon_shared_draw_under_coarser_psu` covering an end-to-end L_max=2 + coarser PSU case that exercises the shared-draw path + assertion. Full regression: 312 passing. 
--- .../chaisemartin_dhaultfoeuille_bootstrap.py | 25 ++++++++++++++++-- tests/test_survey_dcdh_replicate_psu.py | 26 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a85ae446..a456890e 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -302,8 +302,29 @@ def _compute_dcdh_bootstrap( for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): if u_h.size > 0 and n_h > 0: - # Use the shared weight matrix truncated to u_h length - w_h = shared_weights[:, : u_h.size] + # Under the current contract every horizon's IF + # vector uses the variance-eligible group ordering + # from `eligible_group_ids`, so the shared weight + # matrix is already at the right shape. Assert + # this invariant so any future refactor that + # introduces horizon-specific masking fails loudly + # rather than silently misaligning PSU clusters via + # positional truncation. + if u_h.size != n_groups_mh: + raise ValueError( + f"Multi-horizon bootstrap: horizon {l_h} " + f"IF vector has {u_h.size} entries but " + f"shared weight matrix has {n_groups_mh} " + f"columns. dCDH's contract requires every " + f"horizon to use the variance-eligible " + f"group ordering; to support a horizon " + f"with a different ordering, thread " + f"target-specific group IDs through " + f"`multi_horizon_inputs` and project the " + f"shared PSU draws onto the horizon's own " + f"ordering via `_map_for_target`." 
+ ) + w_h = shared_weights deviations = (w_h @ u_h) / n_h dist_h = deviations + eff_h diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 836583d2..3838dd16 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -501,6 +501,32 @@ def test_multi_horizon_psu_bootstrap(self): assert np.isfinite(info["effect"]) assert np.isfinite(info["se"]) and info["se"] > 0 + def test_multi_horizon_shared_draw_under_coarser_psu(self): + """End-to-end test of the shared-draw multi-horizon bootstrap + under a strictly coarser PSU with L_max >= 2. Exercises the + per-horizon invariant assertion + sup-t band computation + through the Hall-Mammen wild PSU path.""" + df = _make_strictly_coarser_psu_panel(n_periods=7) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # Both horizons should produce finite bootstrap SEs via the + # shared-draw path projected through the PSU map. + for lag in (1, 2): + info = res.event_study_effects[lag] + if info["n_obs"] > 0: + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + def test_sup_t_under_coarser_psu(self): """Sup-t critical value is computable under coarser PSU.""" df = _make_strictly_coarser_psu_panel(n_periods=6) From 13d5e41f646d39ef2ad9b6c58c3a9542977f8ef9 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 09:24:30 -0400 Subject: [PATCH 05/11] Fix CI review R1: reduce (not replace) design df under replicate variance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses R1 P1 from PR #311 AI review. 
The R1 reviewer flagged that `_effective_df_survey()` was overwriting the design-level `resolved_survey.df_survey` with `min(n_valid) - 1` whenever any replicate-aware IF site ran. For replicate designs with rank deficiency, `ResolvedSurveyDesign.df_survey` returns `QR-rank - 1` (per R's `survey::degf()` convention at `diff_diff/survey.py:590`), which can be strictly smaller than `R - 1`. The old behavior produced anti-conservative t-critical values on the top-level dCDH surface. Additionally, the final effective df was never persisted into `survey_metadata.df_survey`, so HonestDiD (which reads from `results.survey_metadata.df_survey` at `honest_did.py:973`) could use a different df than the main dCDH surface. Changes: - `_effective_df_survey()` now returns `min(resolved_survey.df_survey, min(n_valid) - 1)` when both are defined; returns None if base df is undefined (rank ≤ 1) or reduced df is degenerate (< 1). - In `fit()`, immediately before constructing `ChaisemartinDHaultfoeuilleResults`, mutate `survey_metadata.df_survey` to the final effective df (post-all- IF-sites, including heterogeneity). `SurveyMetadata` is a mutable `@dataclass` so direct assignment is safe. - `_compute_heterogeneity_test()` docstring updated to describe the replicate-weight dispatch and effective-df rule (R1 P3). Regression tests in `TestInvariants`: - `test_rank_deficient_replicate_uses_design_df`: duplicated replicate columns → `design_df < R - 1` → persisted `survey_metadata.df_survey` must equal `design_df` (not `R - 1`). - `test_dropped_replicate_reduces_df`: zeroed replicate columns produce rank deficiency in the design → persisted df must be bounded by `design_df`. Full regression: 324 passing. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 63 ++++++++++++++--- tests/test_survey_dcdh_replicate_psu.py | 87 ++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 11 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 54890d35..d72f543a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2683,6 +2683,22 @@ def fit( effective_joiners_available = False effective_leavers_available = False + # Persist the final effective df_survey (post-heterogeneity, + # post-all-IF-sites) into survey_metadata so downstream + # consumers — HonestDiD bounds (honest_did.py:973 reads + # results.survey_metadata.df_survey), exported metadata, and + # users — all see the same df that the top-level dCDH inference + # actually used. Before this assignment, survey_metadata carries + # the design-level `resolved_survey.df_survey` from survey + # resolution; under replicate designs the effective df may be + # smaller (reduced via _replicate_n_valid_list). SurveyMetadata + # is a mutable @dataclass (diff_diff/survey.py:681), so direct + # attribute assignment is safe. + if survey_metadata is not None: + survey_metadata.df_survey = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, overall_se=effective_overall_se, @@ -3439,8 +3455,21 @@ def _compute_heterogeneity_test( Observation-level survey info with keys ``group_ids`` (raw per-row group labels), ``weights`` (per-row survey weights), and ``resolved`` (ResolvedSurveyDesign). When provided, the regression uses WLS with - per-group weights W_g = sum of obs survey weights, and SE is computed - via Binder TSL IF expansion through ``compute_survey_if_variance``. + per-group weights W_g = sum of obs survey weights. 
SE is computed + via Binder TSL IF expansion through ``compute_survey_if_variance`` + by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), + dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style + variance. The effective df for t-critical values follows the + site-level ``min(df_s, n_valid_het - 1)`` rule and the helper + mutates ``replicate_n_valid_list`` so the final + ``_effective_df_survey(...)`` sees this site's n_valid. + replicate_n_valid_list : list[int], optional + Shared accumulator for replicate-weight ``n_valid`` counts across + IF sites. When provided and a replicate design is in use, this + function appends its own ``n_valid_het`` before computing the + local effective df — so both the local inference fields and the + final ``survey_metadata.df_survey`` (set by ``fit()``) reflect + this site's contribution. Returns ------- @@ -4985,18 +5014,30 @@ def _effective_df_survey( Under replicate-weight designs, each IF site returns an ``n_valid`` count (number of replicate columns that produced a finite estimate). - This helper returns ``min(n_valid) - 1`` across sites, matching the - precedent in ``diff_diff/efficient_did.py:1133-1135`` and - ``diff_diff/triple_diff.py:676-686``. Under TSL (analytical) or - non-survey fits, ``replicate_n_valid_list`` is empty and this falls - through to ``resolved_survey.df_survey`` (or ``None`` when there's - no survey design at all). + This helper **reduces** — never overwrites — the design-level + ``resolved_survey.df_survey`` (which for replicate designs is + ``QR-rank - 1`` per R's ``survey::degf()`` convention; see + ``diff_diff/survey.py:590``). Returns + ``min(resolved_survey.df_survey, min(n_valid) - 1)`` when both are + defined. + + Matches the precedent in ``diff_diff/efficient_did.py:1133-1135`` + and ``diff_diff/triple_diff.py:676-686`` (both reduce, not + replace). 
Under TSL (analytical) or non-survey fits, + ``replicate_n_valid_list`` is empty and this falls through to the + design-level df. Returns ``None`` when either the base df is + undefined (rank ≤ 1) or the reduced df would be < 1 — preserving + the NaN-inference contract from ``safe_inference``. """ if resolved_survey is None: return None - if replicate_n_valid_list: - return int(min(replicate_n_valid_list)) - 1 - return resolved_survey.df_survey + base_df = resolved_survey.df_survey + if not replicate_n_valid_list: + return base_df + reduced_df = int(min(replicate_n_valid_list)) - 1 + if base_df is None or reduced_df < 1: + return None + return min(int(base_df), reduced_df) def _compute_se( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 3838dd16..c07d2cd9 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -711,3 +711,90 @@ def test_honest_did_under_replicate( # honest_did.py internals. Just verify the attribute is # populated so the SE flow-through is exercised. assert res.honest_did_results is not None + + def test_rank_deficient_replicate_uses_design_df(self, base_panel): + """Duplicated replicate columns → QR-rank < R → design df < R-1. + + Regression for PR #311 CI review R1 P1. The main dCDH surface, + ``survey_metadata.df_survey``, and HonestDiD must all use the + reduced effective df (``min(design_df, n_valid - 1)``) rather + than the naive ``n_valid - 1`` that ignored the design's QR + rank. Before the fix, ``survey_metadata.df_survey`` stayed at + ``R - 1`` for rank-deficient replicate designs — which is + anti-conservative for t-inference. + """ + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Force duplicate columns to induce rank deficiency: + # rep1 = rep0, rep3 = rep2 → true rank = R - 2. 
+ df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + # Read off the design-level df via the resolved design so the + # test is robust to internal QR tolerance changes. + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1, ( + f"Expected rank deficiency: design df={design_df} " + f"should be < {R - 1}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, honest_did=True, + ) + # survey_metadata carries the final (reduced) df — not R - 1. + assert res.survey_metadata.df_survey == design_df, ( + f"Expected survey_metadata.df_survey == design_df " + f"({design_df}), got {res.survey_metadata.df_survey}." + ) + # HonestDiD reads results.survey_metadata.df_survey directly + # (honest_did.py:973), so the same reduced df flows through. + # We assert the HonestDiD result is populated (df flow-through + # is exercised by the honest_did=True call above). + assert res.honest_did_results is not None + + def test_dropped_replicate_reduces_df(self, base_panel): + """When some replicate columns produce degenerate per-replicate + estimates (e.g., all-zero weight vectors), the design's QR rank + drops below R — which flows through to the effective df. + + Regression for PR #311 CI review R1 P2. Even when every IF site + reports the full ``n_valid = R``, the persisted df must never + exceed the design-level df. Pre-fix, ``_effective_df_survey`` + returned ``n_valid - 1 = R - 1`` which over-counted degrees of + freedom on a reduced-rank design. + """ + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=4) + # Zero out two replicate columns → drops rank by 2. 
+ df["rep0"] = 0.0 + df["rep1"] = 0.0 + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df <= R - 3, ( + f"Expected zero-column rank deficiency: design df={design_df} " + f"should be <= {R - 3}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # Persisted df must be capped by the design df, not R - 1. + assert res.survey_metadata.df_survey is not None + assert res.survey_metadata.df_survey <= design_df, ( + f"Expected survey_metadata.df_survey <= design_df " + f"({design_df}), got {res.survey_metadata.df_survey}. " + f"Effective df must NEVER exceed the design-level df under " + f"replicate variance." + ) + assert res.survey_metadata.df_survey < R - 1, ( + f"Expected persisted df to reflect reduced rank (< R - 1 = " + f"{R - 1}), got {res.survey_metadata.df_survey}." + ) From 7e62d4cf479d3c4aede4dc595791d2b29fa7b8b1 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:09:49 -0400 Subject: [PATCH 06/11] Fix CI review R2: undefined-df NaN inference + cross-surface df consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses R2 P0 + P1 from PR #311 AI review. R2 P0: `_effective_df_survey` returns None when replicate design has undefined base df (QR-rank ≤ 1), but `safe_inference(df=None)` uses z-inference — NOT NaN. So a rank-1 replicate design with finite SE silently produced valid-looking p-values/CIs where REGISTRY.md mandates NaN. R2 P1: `_compute_heterogeneity_test` re-derived its local df_s from `min(replicate_n_valid_list) - 1` directly — bypassing the base-df cap the rest of dCDH uses. 
When df_s was None, the in-loop fallback promoted it to `n_valid_het - 1`, which rescued a rank-deficient design in heterogeneity inference that the main surface correctly treated as NaN. Separately, heterogeneity appended its own n_valid AFTER main inference had frozen, leaving `overall_t`, `event_study_effects`, etc. computed with a larger-than-final df while HonestDiD (reading the post-heterogeneity `survey_metadata.df_survey`) used the smaller final df. Changes: - Add `_inference_df(effective_df, resolved_survey)` helper next to `_effective_df_survey`: coerces None to 0 when replicate (triggers safe_inference's NaN branch), else returns effective_df as-is. - Wrap every dCDH `safe_inference(df=...)` call site (~14 sites) with `_inference_df(df, resolved_survey)`. - In `_compute_heterogeneity_test`, derive `df_s` via `_effective_df_survey(resolved, list(replicate_n_valid_list or []))` instead of the ad-hoc `min - 1`. When `df_s is None` (undefined base df), keep `df_s_local = None` — never promote. - Add a post-heterogeneity finalization block in `fit()` that re-runs `safe_inference` for every public surface (overall, joiners, leavers, multi-horizon, placebo-horizon, heterogeneity) with the FINAL effective df. This guarantees every surface and `survey_metadata.df_survey` use the same df. - Update REGISTRY.md heterogeneity Note to document replicate-weight dispatch and the effective-df rule. Regression tests (TestInvariants): - `test_rank_1_replicate_forces_nan_inference`: identical replicate columns → rank-1 design → design df None → all inference fields NaN. - `test_heterogeneity_replicate_cross_surface_df_consistency`: rank-deficient replicate design with `heterogeneity=` active — asserts `overall_p_value` matches `t(final_df)` distribution and heterogeneity fields agree. Full regression: 326 passing. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 184 +++++++++++++++++------ docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 105 +++++++++++++ 3 files changed, 248 insertions(+), 43 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d72f543a..d4a6fb15 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -1627,7 +1627,7 @@ def fit( did_l_val = multi_horizon_dids[l_h]["did_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) + t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey)) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1756,7 +1756,8 @@ def fit( pl_val = pl_data["placebo_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, df=_df_s + pl_val, se_pl_l, alpha=self.alpha, + df=_inference_df(_df_s, resolved_survey), ) placebo_horizon_inference[lag_l] = { "effect": pl_val, @@ -1863,7 +1864,8 @@ def fit( ) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, df=_df_survey + overall_att, overall_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) # Joiners SE (uses joiner-only centered IF; conservative bound) @@ -1878,7 +1880,8 @@ def fit( _replicate_n_valid_list.append(n_valid_joiners) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, df=_df_survey + joiners_att, joiners_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: joiners_se, joiners_t, joiners_p, joiners_ci = 
( @@ -1900,7 +1903,8 @@ def fit( _replicate_n_valid_list.append(n_valid_leavers) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, df=_df_survey + leavers_att, leavers_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: leavers_se, leavers_t, leavers_p, leavers_ci = ( @@ -2200,17 +2204,17 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_df_survey)[0] + overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_df_survey)[0] + joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_df_survey)[0] + leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2372,7 +2376,8 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se 
effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, df=_df_survey + delta_val, delta_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: effective_overall_se = float("nan") @@ -2406,7 +2411,8 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, df=_df_survey + pl_data["placebo_l"], pl_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) placebo_event_study_dict[-lag_l] = { "effect": pl_data["placebo_l"], @@ -2457,7 +2463,8 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, df=_df_survey + eff, bs_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), )[0] # Phase 2: build normalized_effects with SE @@ -2470,7 +2477,7 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_df_survey) + t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey)) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2529,7 +2536,8 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, df=_df_survey + cum_effect, running_se_ub, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) cumulated[l_h] = { "effect": cum_effect, @@ -2683,21 +2691,73 @@ def fit( effective_joiners_available = False effective_leavers_available = False - # Persist the final effective df_survey (post-heterogeneity, - # post-all-IF-sites) into survey_metadata so downstream - # consumers — HonestDiD bounds 
(honest_did.py:973 reads - # results.survey_metadata.df_survey), exported metadata, and - # users — all see the same df that the top-level dCDH inference - # actually used. Before this assignment, survey_metadata carries - # the design-level `resolved_survey.df_survey` from survey - # resolution; under replicate designs the effective df may be - # smaller (reduced via _replicate_n_valid_list). SurveyMetadata - # is a mutable @dataclass (diff_diff/survey.py:681), so direct - # attribute assignment is safe. - if survey_metadata is not None: - survey_metadata.df_survey = _effective_df_survey( - resolved_survey, _replicate_n_valid_list + # R2 P1b: Finalize replicate-df propagation across public + # surfaces. When heterogeneity is active, its n_valid is + # appended to `_replicate_n_valid_list` AFTER the main + # surfaces (overall / joiners / leavers / event study / + # placebo horizon) have already computed their t/p/CI fields + # with an intermediate `_df_survey`. If heterogeneity reports + # the smallest n_valid, the main-surface inference would be + # anti-conservative relative to `survey_metadata.df_survey` + # and HonestDiD. Re-run safe_inference with the FINAL + # effective df so every surface agrees. 
+ _final_eff_df = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + if _replicate_n_valid_list: + _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, + alpha=self.alpha, df=_final_inf_df, ) + if joiners_available: + joiners_t, joiners_p, joiners_ci = safe_inference( + joiners_att, joiners_se, + alpha=self.alpha, df=_final_inf_df, + ) + if leavers_available: + leavers_t, leavers_p, leavers_ci = safe_inference( + leavers_att, leavers_se, + alpha=self.alpha, df=_final_inf_df, + ) + if multi_horizon_inference is not None: + for _lag_r2, _info_r2 in list(multi_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if placebo_horizon_inference is not None: + for _lag_r2, _info_r2 in list(placebo_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if heterogeneity_effects: + for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): + if np.isfinite(_info_r2["se"]): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["beta"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + + # Persist the final effective df_survey into survey_metadata so + # downstream consumers — HonestDiD bounds (honest_did.py:973 + # reads results.survey_metadata.df_survey), exported metadata, + # and users — all see the same df that the recomputed + # inference above used. SurveyMetadata is a mutable @dataclass + # (diff_diff/survey.py:681), so direct attribute assignment is + # safe. 
+ if survey_metadata is not None: + survey_metadata.df_survey = _final_eff_df results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, @@ -3496,17 +3556,18 @@ def _compute_heterogeneity_test( obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - # df_s starts from the design-level df (n_psu - n_strata under TSL, - # R - 1 under replicate). It may be reduced further by replicate - # failures reported via replicate_n_valid_list (this function - # appends its own n_valid observations). - if ( - replicate_n_valid_list is not None - and len(replicate_n_valid_list) > 0 - ): - df_s = int(min(replicate_n_valid_list)) - 1 - else: - df_s = resolved.df_survey if resolved is not None else None + # df_s starts from the shared effective df (base-capped by + # design df). Under replicate designs the helper handles the + # min(resolved.df_survey, min(n_valid) - 1) reduction and + # preserves None when the base df is undefined (QR-rank ≤ 1). + # Using the helper here — rather than re-deriving locally — + # keeps heterogeneity's df consistent with the main dCDH + # surfaces (R2 P1a). `list(... or [])` avoids accidental + # mutation of the caller's shared tracker at this site; the + # explicit append happens inside the horizon loop below. + df_s = _effective_df_survey( + resolved, list(replicate_n_valid_list or []) + ) # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, # Step 6 multi-switch) appear in obs_gids_raw but must be @@ -3674,9 +3735,17 @@ def _compute_heterogeneity_test( if replicate_n_valid_list is not None: replicate_n_valid_list.append(n_valid_het) # Reduce df_s to reflect this horizon's n_valid. 
- df_s_local = int(n_valid_het) - 1 if df_s is None else min( - df_s, int(n_valid_het) - 1 - ) + # R2 P1a: when the shared base-capped df_s is None + # (undefined base df, e.g., QR-rank ≤ 1), the + # heterogeneity df MUST stay None — per-site n_valid + # cannot rescue a rank-deficient design. The + # _inference_df wrapper at the safe_inference call + # below coerces None to 0 under replicate, forcing + # NaN inference. + if df_s is None: + df_s_local = None + else: + df_s_local = min(int(df_s), int(n_valid_het) - 1) else: var_s = compute_survey_if_variance(psi_obs, resolved) df_s_local = df_s @@ -3686,7 +3755,8 @@ def _compute_heterogeneity_test( else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s_local + beta_het, se_het, alpha=alpha, + df=_inference_df(df_s_local, resolved), ) results[l_h] = { @@ -5006,6 +5076,36 @@ def _validate_group_constant_strata_psu( ) +def _inference_df( + effective_df: Optional[int], + resolved_survey: Any, +) -> Optional[int]: + """Coerce an effective-df value for use in ``safe_inference(df=...)``. + + ``_effective_df_survey()`` returns ``None`` when the replicate + design's base df is undefined (QR-rank ≤ 1) or when the reduced + df would be < 1. ``safe_inference`` treats ``df=None`` as the + standard normal (z-inference) and only returns NaN for + ``df <= 0``. Under a replicate design, "undefined effective df" + MUST map to NaN inference per REGISTRY.md — NOT to z-inference. + + Returns: + - ``effective_df`` when defined (t-inference with that df). + - ``0`` when ``effective_df is None`` AND ``resolved_survey`` uses + replicate variance — forces ``safe_inference`` to return NaN + t/p/CI via its ``df <= 0`` branch. + - ``None`` otherwise (no survey, or TSL design where the resolver + always returns an int df — z-inference is never reachable here). 
+ """ + if effective_df is not None: + return effective_df + if resolved_survey is None: + return None + if getattr(resolved_survey, "uses_replicate_variance", False): + return 0 + return None + + def _effective_df_survey( resolved_survey: Any, replicate_n_valid_list: List[int], diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 60444f9e..8a95f40b 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. 
-- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. +- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). 
Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. 
The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. 
diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index c07d2cd9..5c873611 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -798,3 +798,108 @@ def test_dropped_replicate_reduces_df(self, base_panel): f"Expected persisted df to reflect reduced rank (< R - 1 = " f"{R - 1}), got {res.survey_metadata.df_survey}." ) + + def test_rank_1_replicate_forces_nan_inference(self, base_panel): + """All replicate columns identical → QR-rank = 1 → design df + undefined (None). Even if per-IF replicate SEs come back + finite, every public inference field (t_stat / p_value / + conf_int) MUST be NaN — otherwise users get z-based inference + silently instead of the NaN contract documented in REGISTRY.md. + + R2 P0 regression: before the fix, ``_effective_df_survey`` + returned None, ``safe_inference(df=None)`` computed z-based + statistics, and a rank-1 replicate design with finite SEs + would produce valid-looking (but wrong) p-values/CIs. + """ + R = 8 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Make every replicate column identical → QR-rank = 1. + rep_col_shared = df["rep0"].copy() + for r in range(R): + df[f"rep{r}"] = rep_col_shared + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + assert resolved.df_survey is None, ( + "Rank-1 replicate matrix must have undefined design df" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # survey_metadata.df_survey stays None — the honest undefined + # signal for downstream consumers. + assert res.survey_metadata.df_survey is None + # Main dCDH inference fields are NaN via the _inference_df + # coercion (None → 0 under replicate → safe_inference NaN + # branch). 
+ assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + # Event study horizon 1 is also NaN. + if 1 in res.event_study_effects: + info = res.event_study_effects[1] + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + + def test_heterogeneity_replicate_cross_surface_df_consistency( + self, base_panel, + ): + """With ``heterogeneity=`` active under a rank-deficient + replicate design, every public surface — top-level dCDH, + event study, heterogeneity, ``survey_metadata.df_survey``, + and HonestDiD — MUST use the SAME final effective df. + + R2 P1 regression: before the fix, the main surfaces + (``overall_*``, ``event_study_effects``) computed t/p/CI + with an intermediate ``_df_survey`` that did not include + heterogeneity's ``n_valid_het``. Then + ``survey_metadata.df_survey`` was overwritten with the + post-heterogeneity min — so HonestDiD used a different df + than the main dCDH surface. + """ + from scipy import stats as _stats + + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=7) + # Rank deficiency: 2 duplicated pairs → rank = R - 2 = 10. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1 + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + honest_did=True, + ) + final_df = res.survey_metadata.df_survey + # All sites report n_valid == R → effective df = min(design_df, R-1) + # = design_df. + assert final_df == design_df + # Verify overall inference was computed with df=final_df by + # checking p-value against t(final_df) — would disagree if + # df=R-1 or df=None was used. 
+ t_stat = res.overall_att / res.overall_se + expected_p = 2 * _stats.t.sf(abs(t_stat), df=final_df) + assert res.overall_p_value == pytest.approx(expected_p, rel=1e-6) + # Same check for heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]): + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf(abs(het_t), df=final_df) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6 + ) + # HonestDiD bounds were computed from the same survey_metadata; + # asserting the result attribute is populated confirms the df + # flow-through was exercised. + assert res.honest_did_results is not None From b48ee534e3a0934d8767fa9f8c22eb24788ccbd6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:25:29 -0400 Subject: [PATCH 07/11] Fix CI review R3: placebo-horizon key + registry df formula MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two non-blocker items from PR #311 AI review R3: - P2: `test_placebo_under_replicate` queried `placebo_event_study[1]` but placebo horizons are stored under NEGATIVE keys (REGISTRY.md line 546 — `-l` corresponds to backward outcome difference `Y_{g, F_g-1-l} - Y_{g, F_g-1}`; storage at `chaisemartin_dhaultfoeuille.py:2402` uses `-lag_l` as the key). The test's assertion silently skipped (`.get(1)` returned None) so the replicate placebo SE path was effectively unverified. Fixed to use key `-1` and added explicit assertions on `se`, `p_value`, and `conf_int`. - P3: REGISTRY line 653 described the replicate-effective df as `min(n_valid_across_sites) - 1`, but the actual rule (and the detailed heterogeneity note at line 618) is the capped form `min(resolved_survey.df_survey, min(n_valid) - 1)`. Updated the wording to include the design-df cap and the undefined-base-df NaN case — matching the code and line 618. Full regression: 316 passing. 
--- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 8a95f40b..48c734f3 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -650,7 +650,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. - **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). 
Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. -- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. 
For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(n_valid_across_sites) - 1` propagates to t-critical values. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. 
--- diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 5c873611..baa36120 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -266,7 +266,12 @@ def test_multi_horizon_under_replicate( assert np.isfinite(info["se"]) and info["se"] > 0 def test_placebo_under_replicate(self, replicate_design): - """Placebo DID^{pl}_l under replicate weights also carries finite SE.""" + """Placebo DID^{pl}_l under replicate weights carries finite SE, + p-value, and CI. Placebo horizons are stored under NEGATIVE + keys in ``placebo_event_study`` (see REGISTRY.md line 546 — + ``-l`` corresponds to backward outcome difference + ``Y_{g, F_g-1-l} - Y_{g, F_g-1}``). Pre-R3-fix this test used + key ``1`` and silently skipped the assertion.""" df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) R = 20 df = replicate_design(df, R=R, method="BRR") @@ -280,10 +285,20 @@ def test_placebo_under_replicate(self, replicate_design): survey_design=sd, L_max=1, ) - if res.placebo_event_study: - info = res.placebo_event_study.get(1) - if info is not None and info.get("n_obs", 0) > 0: - assert np.isfinite(info["se"]) + assert res.placebo_event_study, ( + "Expected placebo_event_study to be populated for L_max=1 " + "reversible panel" + ) + # Negative key convention: placebo horizon l → key -l. 
+ assert -1 in res.placebo_event_study, ( + f"Expected key -1 in placebo_event_study (got keys " + f"{sorted(res.placebo_event_study)})" + ) + info = res.placebo_event_study[-1] + assert info["n_obs"] > 0, "Placebo horizon should have observations" + assert np.isfinite(info["se"]) and info["se"] > 0 + assert np.isfinite(info["p_value"]) + assert np.all(np.isfinite(info["conf_int"])) def test_did_x_replicate(self, base_panel, replicate_design): """DID^X covariate adjustment flows through the Class A dispatch From bcc14c41e18ac4a09ce36f87e044857c7e829250 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:46:21 -0400 Subject: [PATCH 08/11] Fix CI review R4: late-arriving n_valid propagation + doc drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R4 (1 × P2 + 2 × P3). P2: The cross-surface df consistency test used a duplicated-replicate design where `resolved.df_survey` binds as the cap BEFORE heterogeneity contributes. That scenario can't detect a broken late-propagation path. Added a monkeypatch-based regression (`test_heterogeneity_late_nvalid_propagates_to_all_surfaces`) that: 1. First counts the main-surface calls to `compute_replicate_if_variance` (no heterogeneity). 2. Then runs fit with `heterogeneity=` active and a patched helper that returns a SMALLER n_valid ONLY on the heterogeneity call (call index > main_count), leaving main surfaces at full n_valid. 3. Asserts every public surface (`overall_p_value`, `event_study_effects`, `placebo_event_study`, `heterogeneity_effects`, `survey_metadata.df_survey`) uses the reduced df via t(reduced_n_valid - 1) distribution. The test caught two real bugs that the duplicated-column test missed: - `effective_overall_t/p/ci` are set by copying from the raw `overall_t/p/ci` at line ~2331 (before heterogeneity runs) and are used as the results-constructor inputs at line ~2776. 
The R2 recompute block only reassigned `overall_*` — so `effective_overall_*` (what actually shipped) stayed on the pre-heterogeneity df. Fix: recompute `effective_overall_*` from `effective_overall_att/se` (covering both the normal and cost-benefit delta paths), and keep `overall_*` in sync for downstream consumers. - `placebo_event_study_dict[-lag]` holds VALUE copies of `placebo_horizon_inference[lag]`, so mutating the inner dict in the recompute block does NOT propagate to the public surface. Fix: explicitly update `placebo_event_study_dict[-lag]` alongside the mutation in the recompute loop. P3a: REGISTRY overview line 466 still said "Replicate weights and PSU-level bootstrap are deferred" — contradicting the updated checklist at lines 651-653. Rewrote to match the new contract. P3b: `diff_diff/guides/llms-full.txt:325` still said "Replicate weights and PSU-level bootstrap deferred". Updated to reflect replicate support + opt-in PSU-level Hall-Mammen bootstrap + the mutual-exclusion contract with `n_bootstrap > 0`. Full regression: 327 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 28 +++++ diff_diff/guides/llms-full.txt | 2 +- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 137 +++++++++++++++++++++++ 4 files changed, 167 insertions(+), 2 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d4a6fb15..ed6c459f 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2706,6 +2706,21 @@ def fit( ) if _replicate_n_valid_list: _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + # Recompute `effective_overall_*` directly — that's what + # ships in results at line ~2776+. `effective_overall_att/se` + # may differ from the raw `overall_att/se` under the delta + # (cost-benefit) path at L_max >= 2; recomputing from + # `effective_*` ensures both paths get the final df. 
+ if np.isfinite(effective_overall_se): + effective_overall_t, effective_overall_p, effective_overall_ci = ( + safe_inference( + effective_overall_att, effective_overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + ) + # Keep `overall_*` in sync for any downstream code that + # reads them directly (e.g., placebo_event_study_dict + # construction flows from overall_*). overall_t, overall_p, overall_ci = safe_inference( overall_att, overall_se, alpha=self.alpha, df=_final_inf_df, @@ -2738,6 +2753,19 @@ def fit( _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 _info_r2["conf_int"] = _ci_r2 + # `placebo_event_study_dict` holds VALUE copies + # (not shared references) of the inner dicts, so + # mutating `placebo_horizon_inference[lag]` above + # does NOT propagate to the public surface. Update + # the negative-key mirror explicitly so the + # recomputed t/p/CI ship in results. + if ( + placebo_event_study_dict is not None + and -_lag_r2 in placebo_event_study_dict + ): + placebo_event_study_dict[-_lag_r2]["t_stat"] = _t_r2 + placebo_event_study_dict[-_lag_r2]["p_value"] = _p_r2 + placebo_event_study_dict[-_lag_r2]["conf_int"] = _ci_r2 if heterogeneity_effects: for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): if np.isfinite(_info_r2["se"]): diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 5efb8f13..7e9a4517 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -322,7 +322,7 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` - Phase 1 placebo SE is intentionally `NaN` with a warning. The dynamic companion paper Section 3.7.3 derives the cohort-recentered analytical variance for `DID_l` only — not for the placebo `DID_M^pl`. Phase 2 will add multiplier-bootstrap support for the placebo. 
Until then, the placebo point estimate is meaningful but its inference fields stay NaN-consistent **even when `n_bootstrap > 0`** (bootstrap currently covers `DID_M`, `DID_+`, and `DID_-` only) - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling -- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred +- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) ### SunAbraham diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 48c734f3..8d1643ca 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -463,7 +463,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1: - [de Chaisemartin, C. & D'Haultfœuille, X. (2020). Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects. *American Economic Review*, 110(9), 2964-2996.](https://doi.org/10.1257/aer.20181169) - [de Chaisemartin, C. & D'Haultfœuille, X. (2022, revised 2024). Difference-in-Differences Estimators of Intertemporal Treatment Effects. NBER Working Paper 29873.](https://www.nber.org/papers/w29873) — Web Appendix Section 3.7.3 contains the cohort-recentered plug-in variance formula implemented here. -**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. 
Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization; replicate weights and PSU-level bootstrap are deferred. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. +**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR) across all IF sites, plus opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0` (see the full checklist + Notes below for the contract). **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. 
**Key implementation requirements:** diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index baa36120..a5a4856f 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -918,3 +918,140 @@ def test_heterogeneity_replicate_cross_surface_df_consistency( # asserting the result attribute is populated confirms the df # flow-through was exercised. assert res.honest_did_results is not None + + def test_heterogeneity_late_nvalid_propagates_to_all_surfaces( + self, base_panel, monkeypatch, + ): + """Regression for PR #311 CI review R4 P2. + + The duplicated-column test above exercises the design-df cap + (which binds BEFORE heterogeneity runs), so it cannot catch a + bug where the post-heterogeneity recompute block fails to + propagate the final df to every public surface. This test + isolates the late-arriving case by monkeypatching + `compute_replicate_if_variance` to return a SMALLER n_valid + only on the heterogeneity call site — after main surfaces + have already frozen their t/p/CI fields with the larger df. + + Every public surface (`overall_*`, `event_study_effects`, + `placebo_event_study`, `heterogeneity_effects`, + `survey_metadata.df_survey`) must reflect the reduced df. + """ + from scipy import stats as _stats + from diff_diff import survey as _survey_mod + + R = 20 + reduced_n_valid = 7 # small enough that (reduced - 1) < R - 1 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + # Clean (full-rank) design — design df is NOT the binding cap. + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Pass 1: count the main-surface calls to + # `compute_replicate_if_variance` (L_max=1, no heterogeneity). 
+ main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Pass 2: reduce n_valid on calls STRICTLY AFTER the main + # surfaces (i.e., heterogeneity's call). + tracker = {"count": 0} + + def reduce_after_main(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, reduced_n_valid + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", + reduce_after_main, + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + + # Heterogeneity's smaller n_valid should be the binding final + # df: min(R - 1, reduced_n_valid - 1) = reduced_n_valid - 1. + expected_df = reduced_n_valid - 1 + assert res.survey_metadata.df_survey == expected_df, ( + f"survey_metadata.df_survey should be {expected_df} " + f"(reduced_n_valid - 1), got {res.survey_metadata.df_survey}" + ) + + # Overall surface: p-value must use the reduced df. + t_stat = res.overall_att / res.overall_se + expected_p_overall = 2 * _stats.t.sf(abs(t_stat), df=expected_df) + assert res.overall_p_value == pytest.approx( + expected_p_overall, rel=1e-6, + ), ( + "overall_p_value should reflect heterogeneity's late " + "n_valid via the post-recompute block" + ) + + # Event study (L=1) surface. 
+ if res.event_study_effects and 1 in res.event_study_effects: + info = res.event_study_effects[1] + if np.isfinite(info["se"]) and info["se"] > 0: + t_l = info["effect"] / info["se"] + expected_p_l = 2 * _stats.t.sf(abs(t_l), df=expected_df) + assert info["p_value"] == pytest.approx( + expected_p_l, rel=1e-6, + ) + + # Placebo event study surface (NEGATIVE keys). This is the + # one the copied-value bug would hide: `placebo_event_study` + # holds a VALUE copy of `placebo_horizon_inference`, so a + # broken recompute block would leave stale t/p/CI here. + if res.placebo_event_study: + for _lag_neg, pl_info in res.placebo_event_study.items(): + if np.isfinite(pl_info["se"]) and pl_info["se"] > 0: + t_pl = pl_info["effect"] / pl_info["se"] + expected_p_pl = 2 * _stats.t.sf( + abs(t_pl), df=expected_df, + ) + assert pl_info["p_value"] == pytest.approx( + expected_p_pl, rel=1e-6, + ), ( + f"placebo_event_study[{_lag_neg}] p-value " + f"must reflect the reduced df " + f"({expected_df}); got " + f"{pl_info['p_value']} vs expected " + f"{expected_p_pl}" + ) + + # Heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]) and het_info["se"] > 0: + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf( + abs(het_t), df=expected_df, + ) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6, + ) From abdfea4630e86fd7079870ee3602696871d2068f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:01:39 -0400 Subject: [PATCH 09/11] Fix CI review R5: propagate final df to normalized_effects + all-NaN branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R5 P0 + P2. 
P0: The R2 post-heterogeneity finalization block updated overall, joiners, leavers, multi-horizon event-study, placebo-horizon, and heterogeneity surfaces, but missed `normalized_effects_out` — which is built at line 2472-2487 from a delta-method SE (`SE(DID_l) / delta^D_l`) using the pre-heterogeneity `_df_survey`. Under a late-arriving replicate `n_valid` from heterogeneity, the public `results.normalized_effects` dict shipped stale p-values/CIs and could miss the all-NaN gate when the final replicate df turned undefined. Fix: extend the finalization block to also recompute every `normalized_effects_out[h]` entry with `_final_inf_df`. P2: Extended `test_heterogeneity_late_nvalid_propagates_to_all_surfaces` to assert `res.normalized_effects` p-values match `t(final_df)` distribution. Added a new case `test_late_nvalid_below_two_forces_all_surfaces_nan` that drives the final `n_valid` to 1 (reduced df = 0 → None via `_inference_df` coercion) and verifies EVERY public surface (overall, event_study_effects, placebo_event_study, heterogeneity_effects, normalized_effects) flips to NaN — not silently keeps the finite values computed with the larger intermediate df. Full regression: 328 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 13 +++ tests/test_survey_dcdh_replicate_psu.py | 109 +++++++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index ed6c459f..e26b612c 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2776,6 +2776,19 @@ def fit( _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 _info_r2["conf_int"] = _ci_r2 + # Normalized effects: another public surface built with the + # pre-heterogeneity `_df_survey`. Recompute inference with + # the final df so t/p/CI match the other surfaces (and the + # NaN contract when the final df becomes undefined). 
+ if normalized_effects_out is not None: + for _lag_r2, _info_r2 in list(normalized_effects_out.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 # Persist the final effective df_survey into survey_metadata so # downstream consumers — HonestDiD bounds (honest_did.py:973 diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index a5a4856f..595f80bd 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -1055,3 +1055,112 @@ def reduce_after_main(psi, resolved_arg): assert het_info["p_value"] == pytest.approx( expected_het_p, rel=1e-6, ) + + # Normalized effects surface. This is a DERIVED dict built + # pre-heterogeneity (delta-method SE from per-horizon SE) — + # same class of "copy" bug as placebo_event_study. R4 P0 + # regression: without the recompute, normalized_effects would + # silently ship p_value/conf_int computed with the + # pre-heterogeneity df. 
+ if res.normalized_effects: + for _lag, norm_info in res.normalized_effects.items(): + if np.isfinite(norm_info["se"]) and norm_info["se"] > 0: + norm_t = norm_info["effect"] / norm_info["se"] + expected_norm_p = 2 * _stats.t.sf( + abs(norm_t), df=expected_df, + ) + assert norm_info["p_value"] == pytest.approx( + expected_norm_p, rel=1e-6, + ), ( + f"normalized_effects[{_lag}] p-value must " + f"reflect the reduced df ({expected_df}); got " + f"{norm_info['p_value']} vs expected " + f"{expected_norm_p}" + ) + + def test_late_nvalid_below_two_forces_all_surfaces_nan( + self, base_panel, monkeypatch, + ): + """Regression for R4 P2 follow-up: when heterogeneity's late + replicate failures drive the final `n_valid` to 1 (reduced df + = 0), every public inference surface must flip to NaN — not + silently keep the finite values that were computed with a + larger intermediate df. + + `_inference_df` coerces df=0 under replicate → safe_inference + NaN branch. Exercises the all-NaN contract end to end. + """ + from diff_diff import survey as _survey_mod + + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Count main-only calls. + main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Return n_valid=1 on heterogeneity's call → reduced df = 0 + # → _inference_df coerces to NaN. 
+ tracker = {"count": 0} + + def reduce_to_one(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, 1 + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", reduce_to_one + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + # Final effective df is None (reduced_df = 0 < 1 guard). + assert res.survey_metadata.df_survey is None + # Every public inference surface must be NaN. + assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + if res.event_study_effects: + for info in res.event_study_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.placebo_event_study: + for info in res.placebo_event_study.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.heterogeneity_effects: + for info in res.heterogeneity_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.normalized_effects: + for info in res.normalized_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) From 41474069e87331a1019a4177666ca4cb52b80a9f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:20:27 -0400 Subject: [PATCH 10/11] Fix CI review R6: warning gate on eligible groups + docstring updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R6 (2 × P3 cleanups). P3 #1: Warning gate was computed from raw positive-weight groups, not the post-filter eligible-group set used to build the bootstrap PSU map. 
Panels where upstream dCDH filtering drops groups that share PSUs with kept groups could emit a misleading "PSU coarser than group" warning even when the effective bootstrap is one group per PSU. Fix: count PSUs and groups from `_eligible_group_ids` (the same set feeding `group_id_to_psu_code_bootstrap`), preserving the within-group-constant-PSU invariant by taking each eligible group's first positive-weight PSU label. P3 #2: Two docstrings said the bootstrap is "clustered at the group level" only — now incomplete after the PSU-level survey path: - `diff_diff/chaisemartin_dhaultfoeuille.py` class docstring: extended to note PSU-level Hall-Mammen wild clustering under `survey_design` with coarser PSU. - `diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py` module docstring: documents the identity-map fast path (auto-inject `psu=group`), the PSU-level broadcast when PSU is strictly coarser, and points to REGISTRY.md for the full contract. Full regression: 318 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 44 ++++++++++++++----- .../chaisemartin_dhaultfoeuille_bootstrap.py | 17 ++++--- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index e26b612c..d449d59a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -307,7 +307,10 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): default; gate via ``placebo=False``) - Analytical SE via the cohort-recentered plug-in formula from Web Appendix Section 3.7.3 of the dynamic paper - - Optional multiplier bootstrap clustered at the group level + - Optional multiplier bootstrap, clustered at the group level by + default; under ``survey_design`` with a strictly-coarser PSU, the + bootstrap switches to PSU-level Hall-Mammen wild clustering (see + REGISTRY.md ChaisemartinDHaultfoeuille Note on survey + + bootstrap) - Optional TWFE decomposition 
diagnostic from Theorem 1 of AER 2020 (per-cell weights, fraction negative, ``sigma_fe``) @@ -1970,23 +1973,40 @@ def fit( # psu=), groups and PSUs coincide so # Hall-Mammen wild PSU bootstrap equals group-level # multiplier bootstrap — no need to warn. + # Count groups/PSUs on the POST-FILTER eligible set + # (`_eligible_group_ids`) — the same set the bootstrap + # map is built from below. Using raw positive-weight + # groups here would emit a misleading warning on + # panels where upstream dCDH filtering drops groups + # that happen to share PSUs with kept groups. psu_arr_warn = getattr(resolved_survey, "psu", None) if psu_arr_warn is None or _obs_survey_info is None: # No PSU info — can't compare to group count. n_psu_eff_warn, n_groups_eff_warn = -1, -1 else: - pos_mask_warn = ( - np.asarray( - _obs_survey_info["weights"], dtype=np.float64 - ) - > 0 + obs_gids_warn = np.asarray(_obs_survey_info["group_ids"]) + obs_ws_warn = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_warn = obs_ws_warn > 0 + psu_codes_warn = np.asarray(psu_arr_warn) + # Collect the PSU label for each variance-eligible + # group (within-group-constant PSU is validated + # upstream, so the first positive-weight label + # represents the whole group). 
+ eligible_psu_labels: List[Any] = [] + for gid in _eligible_group_ids: + mask_g = (obs_gids_warn == gid) & pos_mask_warn + if mask_g.any(): + eligible_psu_labels.append( + psu_codes_warn[mask_g][0] + ) + n_groups_eff_warn = len(eligible_psu_labels) + n_psu_eff_warn = ( + int(len(np.unique(np.asarray(eligible_psu_labels)))) + if eligible_psu_labels + else -1 ) - psu_eff = np.asarray(psu_arr_warn)[pos_mask_warn] - gids_eff_warn = np.asarray( - _obs_survey_info["group_ids"] - )[pos_mask_warn] - n_psu_eff_warn = int(len(np.unique(psu_eff))) - n_groups_eff_warn = int(len(np.unique(gids_eff_warn))) if 0 <= n_psu_eff_warn < n_groups_eff_warn: warnings.warn( f"Bootstrap with survey_design uses Hall-Mammen " diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a456890e..27552296 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -4,11 +4,18 @@ The dCDH papers prescribe only the analytical cohort-recentered plug-in variance from Web Appendix Section 3.7.3 of the dynamic companion paper. -This module adds an opt-in multiplier bootstrap clustered at the group -level, matching the inference convention used by ``CallawaySantAnna``, -``ImputationDiD``, and ``TwoStageDiD``. The bootstrap is a library -extension, not a paper requirement, and is documented as such in -``REGISTRY.md``. +This module adds an opt-in multiplier bootstrap, clustered at the group +level by default (matching the inference convention used by +``CallawaySantAnna``, ``ImputationDiD``, and ``TwoStageDiD``). Under +``survey_design`` with an explicitly-coarser PSU, the bootstrap switches +to PSU-level Hall-Mammen wild clustering: each PSU draws a single +multiplier and all groups within that PSU share it +(see ``_generate_psu_or_group_weights`` and ``_map_for_target`` below, +plus the REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + +bootstrap). 
Under the default auto-inject ``psu=group`` each group is +its own PSU and the identity-map fast path reproduces the original +group-level behavior bit-for-bit. The bootstrap is a library extension, +not a paper requirement, and is documented as such in ``REGISTRY.md``. The mixin operates on **pre-computed cohort-centered influence-function values**: the main estimator class computes per-group ``U^G_g`` values From 9368ae65b89c347e469c9d711607e50acf434a5a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:38:33 -0400 Subject: [PATCH 11/11] Fix CI review R7: reconcile stale cluster/placebo docstrings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R7 (2 × P3 doc drift cleanups). R7 P3 #1: Several sites still said dCDH "always clusters at the group level" — which was true when the PR was written but is now incomplete given the PSU-level Hall-Mammen wild bootstrap path under `survey_design`. Updated to distinguish user-specified `cluster=` (still unsupported, raises NotImplementedError) from automatic PSU-level clustering (takes over under `survey_design` with strictly-coarser PSUs; identity under auto-inject `psu=group`): - `docs/methodology/REGISTRY.md:592` Note (cluster contract) — rewrote to describe both paths; dropped "Phase 1" framing. - `docs/methodology/REGISTRY.md:636` checklist — added the automatic PSU-level upgrade clause. - `diff_diff/chaisemartin_dhaultfoeuille.py:321` constructor docstring — same contract split. - `diff_diff/chaisemartin_dhaultfoeuille.py:432` / `:503` `cluster=` error messages — removed "Phase 1" phrasing, added PSU-level-under-survey_design context. - `tests/test_chaisemartin_dhaultfoeuille.py:405` regex updated to match the new error wording (no longer pins "Phase 1"). 
R7 P3 #2: `diff_diff/guides/llms-full.txt:321` said Phase 2 will add multiplier-bootstrap support for placebo and bootstrap covers `DID_M`, `DID_+`, `DID_-` only — both stale after this PR's L_max >= 1 placebo and event-study bootstrap paths. Rewrote to scope the NaN-SE contract to `L_max=None` only and describe the full bootstrap coverage (overall, joiners, leavers, per-horizon event-study, placebo horizons, shared weights for sup-t bands). Full regression: 336 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 45 +++++++++++++---------- diff_diff/guides/llms-full.txt | 2 +- docs/methodology/REGISTRY.md | 4 +- tests/test_chaisemartin_dhaultfoeuille.py | 29 ++++++++------- 4 files changed, 45 insertions(+), 35 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d449d59a..d204645c 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -319,14 +319,17 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): alpha : float, default=0.05 Significance level for confidence intervals. cluster : str, optional, default=None - **Phase 1 contract:** ``cluster`` must be ``None`` (the default). - dCDH always clusters at the group level via the cohort-recentered - influence-function plug-in (analytical SEs) and the multiplier - bootstrap (also grouped at the ``group`` column). Passing any - non-``None`` value raises ``NotImplementedError`` with a Phase 1 - pointer. Custom clustering at a coarser or finer level than the - group is reserved for a future phase. See REGISTRY.md - ``ChaisemartinDHaultfoeuille`` section for the full contract. + **Cluster contract:** ``cluster`` must be ``None`` (the default). + dCDH clusters at the group level by default via the cohort- + recentered influence-function plug-in (analytical SEs) and the + multiplier bootstrap. 
Passing any non-``None`` value raises + ``NotImplementedError`` — user-specified custom clustering is + reserved for a future phase. When ``survey_design`` is provided + with strictly-coarser PSUs, the multiplier bootstrap + automatically upgrades to PSU-level Hall-Mammen wild clustering + (see REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + + bootstrap). Under the default auto-inject ``psu=group``, the + group-level and PSU-level paths are bit-identical. n_bootstrap : int, default=0 Number of multiplier-bootstrap iterations. ``0`` (default) uses only the analytical SE. Set to ``999`` or higher for stable @@ -431,12 +434,14 @@ def __init__( raise ValueError(f"n_bootstrap must be non-negative, got {n_bootstrap}") if cluster is not None: raise NotImplementedError( - f"cluster={cluster!r}: custom clustering is not supported in " - f"Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level via the cohort-recentered influence-" - f"function plug-in (analytical SEs) and the multiplier " - f"bootstrap (also grouped at the group column). To use the " - f"supported group-level clustering, pass cluster=None (the " + f"cluster={cluster!r}: user-specified clustering is not " + f"supported in ChaisemartinDHaultfoeuille. dCDH clusters at " + f"the group level by default via the cohort-recentered " + f"influence-function plug-in (analytical SEs) and the " + f"multiplier bootstrap. Under survey_design with strictly-" + f"coarser PSUs, bootstrap clustering automatically upgrades " + f"to PSU-level Hall-Mammen wild. To use the default path, " + f"pass cluster=None (the " f"default). Custom clustering is reserved for a future " f"phase. See REGISTRY.md ChaisemartinDHaultfoeuille section " f"for the full contract." 
@@ -502,11 +507,13 @@ def set_params(self, **params: Any) -> "ChaisemartinDHaultfoeuille": raise ValueError(f"n_bootstrap must be non-negative, got {self.n_bootstrap}") if self.cluster is not None: raise NotImplementedError( - f"cluster={self.cluster!r}: custom clustering is not supported " - f"in Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level. To use the supported group-level " - f"clustering, pass cluster=None (the default). Custom clustering " - f"is reserved for a future phase. See REGISTRY.md " + f"cluster={self.cluster!r}: user-specified clustering is " + f"not supported in ChaisemartinDHaultfoeuille. dCDH clusters " + f"at the group level by default; under survey_design with " + f"strictly-coarser PSUs the bootstrap automatically upgrades " + f"to PSU-level Hall-Mammen wild clustering. Pass cluster=None " + f"(the default) to use this path. User-specified custom " + f"clustering is reserved for a future phase. See REGISTRY.md " f"ChaisemartinDHaultfoeuille section for the full contract." ) return self diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7e9a4517..e14c4d7f 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -320,7 +320,7 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") **Notes:** - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` -- Phase 1 placebo SE is intentionally `NaN` with a warning. The dynamic companion paper Section 3.7.3 derives the cohort-recentered analytical variance for `DID_l` only — not for the placebo `DID_M^pl`. Phase 2 will add multiplier-bootstrap support for the placebo. Until then, the placebo point estimate is meaningful but its inference fields stay NaN-consistent **even when `n_bootstrap > 0`** (bootstrap currently covers `DID_M`, `DID_+`, and `DID_-` only) +- Single-period placebo `DID_M^pl` (i.e. 
`L_max=None`) SE is intentionally `NaN` with a warning — the per-period placebo aggregation has no influence-function derivation. Multi-horizon placebos (`L_max >= 1`) have analytical SE (cohort-recentered IF, library extension beyond the paper's Theorem 1) and bootstrap SE via the shared multiplier-bootstrap path. Bootstrap coverage at `L_max >= 1` spans `DID_M`, `DID_+`, `DID_-`, per-horizon event-study effects (`event_study_effects[l]`), and placebo horizons (`placebo_event_study[-l]`), with shared weights across horizons for valid joint (sup-t) bands - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling - Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 8d1643ca..86008e37 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -589,7 +589,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** When every variance-eligible group forms its own `(D_{g,1}, F_g, S_g)` cohort (a degenerate small-panel case where the cohort framework has zero degrees of freedom), the cohort-recentered plug-in formula is unidentified: cohort recentering subtracts the cohort mean from each group's `U^G_g`, and for singleton cohorts the centered value is exactly zero, so the centered influence function vector collapses to all zeros. The estimator returns `overall_se = NaN` with a `UserWarning` rather than silently collapsing to `0.0` (which would falsely imply infinite precision). The `DID_M` point estimate remains well-defined. 
The bootstrap path inherits the same degeneracy on these panels — the multiplier weights act on an all-zero vector, so the bootstrap distribution is also degenerate. **Deviation from R `DIDmultiplegtDYN`:** R returns a non-zero SE on the canonical 4-group worked example via small-sample sandwich machinery that Python does not implement. Both responses are valid for a degenerate case; Python's `NaN`+warning is the safer default. To get a non-degenerate SE, include more groups so cohorts have peers (real-world panels typically have `G >> K`). -- **Note (Phase 1 cluster contract):** `ChaisemartinDHaultfoeuille` always clusters at the group level. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group; both inference paths cluster at the user's `group` column with no other option. The constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` with a Phase 1 pointer at construction time (and the same gate fires from `set_params`). Custom clustering at a coarser or finer level than the group is reserved for a future phase. The matching test is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. +- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. 
**Automatic PSU-level clustering under `survey_design`:** when `survey_design` carries PSUs strictly coarser than the group and `n_bootstrap > 0`, the multiplier bootstrap automatically switches to Hall-Mammen wild PSU clustering (see the Note on survey + bootstrap below). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and PSU-level paths are identical (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. - **Note (bootstrap inference surface):** When `n_bootstrap > 0`, the top-level `results.overall_p_value` / `results.overall_conf_int` (and joiners/leavers analogues) hold **percentile-based bootstrap inference** computed by the multiplier bootstrap, NOT normal-theory recomputations from the bootstrap SE. The t-stat (`overall_t_stat`, etc.) is computed from the SE via `safe_inference()[0]` to satisfy the project's anti-pattern rule (never compute `t = effect / se` inline) — bootstrap does not define an alternative t-stat semantic for percentile bootstrap, so the SE-based t-stat is the natural choice. `event_study_effects[1]`, `summary()`, `to_dataframe()`, `is_significant`, and `significance_stars` all read from these top-level fields and therefore reflect the bootstrap inference automatically. The library precedent for this propagation is `imputation.py:790-805`, `two_stage.py:778-787`, and `efficient_did.py:1009-1013`. The single-period placebo (`L_max=None`) still has NaN bootstrap fields; multi-horizon placebos (`L_max >= 1`) have valid bootstrap SE/CI/p via `placebo_horizon_ses/cis/p_values` on the bootstrap results object. The matching test is `test_bootstrap_p_value_and_ci_propagated_to_top_level` in `tests/test_chaisemartin_dhaultfoeuille.py::TestBootstrap`. 
@@ -633,7 +633,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Single-lag placebo `DID_M^pl` (point estimate; SE deferred to Phase 2) - [x] TWFE decomposition diagnostic (Theorem 1 of AER 2020): per-cell weights, fraction negative, `sigma_fe`, `beta_fe` - [x] Standalone `twowayfeweights()` helper for users who only want the TWFE diagnostic -- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group +- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group by default; automatically upgraded to PSU-level Hall-Mammen wild clustering under `survey_design` with strictly-coarser PSUs - [x] `drop_larger_lower=True` default (matches R `DIDmultiplegtDYN`); `False` opt-in with explicit inconsistency warning - [x] Singleton-baseline filter (footnote 15 of dynamic paper, variance computation only) with explicit warning - [x] Never-switching groups participate in the variance via stable-control roles after the Round 2 full-IF fix; `n_groups_dropped_never_switching` field retained as backwards-compatibility metadata only diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 1193ce60..081d9e92 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -404,32 +404,35 @@ def test_survey_design_rejects_fweight(self, data): def test_cluster_parameter_raises_not_implemented(self, data): """ - Per Phase 1 cluster contract: dCDH always clusters at the - group level via the cohort-recentered influence function - (analytical SEs) and the multiplier bootstrap (also grouped at - the group column). Custom clustering is not supported in - Phase 1. + Per the dCDH cluster contract: dCDH clusters at the group + level by default via the cohort-recentered influence function + (analytical SEs) and the multiplier bootstrap. 
Under + ``survey_design`` with strictly-coarser PSUs the bootstrap + automatically upgrades to PSU-level Hall-Mammen wild. User- + specified clustering via the ``cluster=`` kwarg is not + supported. The reviewer flagged that ``cluster`` was previously accepted on ``__init__`` and stored on ``self.cluster`` but never actually read by ``fit()`` or ``_compute_dcdh_bootstrap()``, - making it a silent no-op. This test pins the new contract: any + making it a silent no-op. This test pins the contract: any non-None cluster value raises ``NotImplementedError`` at construction time with a message naming the offending value - and pointing at the Phase 1 reservation. The same gate fires - from ``set_params``. + and pointing at the no-custom-clustering reservation. The + same gate fires from ``set_params``. - See REGISTRY.md ``Note (Phase 1 cluster contract)``. + See REGISTRY.md ``Note (cluster contract)``. """ + pattern = r"cluster.*(not supported|reserved for a future)" # __init__ rejects any non-None cluster - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="state") - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="unit") # set_params after construction also rejects est = ChaisemartinDHaultfoeuille() - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): est.set_params(cluster="state") # cluster=None still works (the only supported value) @@ -439,7 +442,7 @@ def test_cluster_parameter_raises_not_implemented(self, data): # The convenience function also rejects (forward-compat gate # propagates through the wrapper at __init__ time) - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): 
chaisemartin_dhaultfoeuille( data, outcome="outcome",