diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 07f6c6f3..cf89c6c3 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -309,7 +309,10 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): (computed automatically by default; gate via ``placebo=False``) - Analytical SE via the cohort-recentered plug-in formula from Web Appendix Section 3.7.3; multiplier bootstrap clustered at the group - level via ``n_bootstrap`` + level by default via ``n_bootstrap``; under ``survey_design`` with + strictly-coarser PSUs the bootstrap automatically upgrades to + PSU-level Hall-Mammen wild clustering (see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey + bootstrap) - Normalized estimator ``DID^n_l``, cost-benefit aggregate ``delta``, and sup-t simultaneous confidence bands - Residualization-style covariate adjustment (``DID^X``) via @@ -317,8 +320,9 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): ``trends_linear=True``, state-set-specific trends via ``trends_nonparam=``, heterogeneity testing, non-binary treatment, HonestDiD sensitivity integration on placebos via ``honest_did=True`` - - Survey support via ``survey_design=`` (pweight + strata/PSU/FPC with - Taylor-series linearization) + - Survey support via ``survey_design=``: pweight with strata/PSU/FPC + via Taylor Series Linearization (analytical) or replicate-weight + variance (BRR/Fay/JK1/JKn/SDR) - TWFE decomposition diagnostic from Theorem 1 of AER 2020 Only ``aggregate`` on :meth:`fit` still raises ``NotImplementedError``. @@ -328,13 +332,27 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): alpha : float, default=0.05 Significance level for confidence intervals. cluster : str, optional, default=None - Must be ``None`` (the default). 
dCDH always clusters at the group - level via the cohort-recentered influence-function plug-in - (analytical SEs) and the multiplier bootstrap (also grouped at the - ``group`` column). Passing any non-``None`` value raises - ``NotImplementedError``. Custom clustering at a coarser or finer - level than the group is a planned extension. See REGISTRY.md - ``ChaisemartinDHaultfoeuille`` section for the full contract. + Must be ``None`` (the default). User-specified clustering via + this kwarg is not supported — passing any non-``None`` value + raises ``NotImplementedError`` at construction time (and the + same gate fires from ``set_params``). The effective clustering + depends on how you call ``fit()``: + + - **Default (no survey_design)**: clustered at the group level + via the cohort-recentered influence-function plug-in + (analytical SEs) and the multiplier bootstrap. + - **Under ``survey_design`` with auto-inject or explicit + ``psu=group``**: PSU coincides with the group and the + group-level and PSU-level paths are bit-identical. + - **Under ``survey_design`` with strictly-coarser PSUs**: the + multiplier bootstrap automatically upgrades to PSU-level + Hall-Mammen wild clustering. + + So dCDH does NOT always cluster at the group level — see + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes on cluster + contract and survey + bootstrap for the full matrix. Custom + user-specified clustering at a coarser or finer level than the + group is a planned extension. n_bootstrap : int, default=0 Number of multiplier-bootstrap iterations. ``0`` (default) uses only the analytical SE. Set to ``999`` or higher for stable @@ -439,12 +457,14 @@ def __init__( raise ValueError(f"n_bootstrap must be non-negative, got {n_bootstrap}") if cluster is not None: raise NotImplementedError( - f"cluster={cluster!r}: custom clustering is not supported in " - f"Phase 1 of ChaisemartinDHaultfoeuille. 
dCDH always clusters " - f"at the group level via the cohort-recentered influence-" - f"function plug-in (analytical SEs) and the multiplier " - f"bootstrap (also grouped at the group column). To use the " - f"supported group-level clustering, pass cluster=None (the " + f"cluster={cluster!r}: user-specified clustering is not " + f"supported in ChaisemartinDHaultfoeuille. dCDH clusters at " + f"the group level by default via the cohort-recentered " + f"influence-function plug-in (analytical SEs) and the " + f"multiplier bootstrap. Under survey_design with strictly-" + f"coarser PSUs, bootstrap clustering automatically upgrades " + f"to PSU-level Hall-Mammen wild. To use the default path, " + f"pass cluster=None (the " f"default). Custom clustering is reserved for a future " f"phase. See REGISTRY.md ChaisemartinDHaultfoeuille section " f"for the full contract." @@ -510,11 +530,13 @@ def set_params(self, **params: Any) -> "ChaisemartinDHaultfoeuille": raise ValueError(f"n_bootstrap must be non-negative, got {self.n_bootstrap}") if self.cluster is not None: raise NotImplementedError( - f"cluster={self.cluster!r}: custom clustering is not supported " - f"in Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level. To use the supported group-level " - f"clustering, pass cluster=None (the default). Custom clustering " - f"is reserved for a future phase. See REGISTRY.md " + f"cluster={self.cluster!r}: user-specified clustering is " + f"not supported in ChaisemartinDHaultfoeuille. dCDH clusters " + f"at the group level by default; under survey_design with " + f"strictly-coarser PSUs the bootstrap automatically upgrades " + f"to PSU-level Hall-Mammen wild clustering. Pass cluster=None " + f"(the default) to use this path. User-specified custom " + f"clustering is reserved for a future phase. See REGISTRY.md " f"ChaisemartinDHaultfoeuille section for the full contract." 
) return self @@ -615,11 +637,25 @@ def fit( ``drop_larger_lower=False`` to retain 2-switch groups. survey_design : SurveyDesign, optional Survey design specification for design-based inference. - Supports pweight with strata/PSU/FPC via Taylor Series - Linearization. Survey weights produce weighted cell means - for the point estimate; the IF-based variance accounts for - the survey design structure. Replicate weights are not yet - supported. + Supports ``weight_type='pweight'`` with two variance paths: + (1) Taylor Series Linearization using strata / PSU / FPC + (analytical) and (2) replicate-weight variance using + BRR / Fay / JK1 / JKn / SDR methods (analytical, + closed-form). Survey weights produce weighted cell means + for the point estimate. Under a survey design without an + explicit ``psu``, ``fit()`` auto-injects ``psu=`` + so the group is the effective sampling unit (strata / PSU + must be within-group-constant; mixed labels within a group + raise ``ValueError``). When ``n_bootstrap > 0`` and a + survey design is supplied, the multiplier bootstrap + operates at the PSU level (Hall-Mammen wild PSU + bootstrap) — under the default auto-inject this collapses + to a group-level clustered bootstrap. **Replicate weights + combined with ``n_bootstrap > 0`` are rejected** with + ``NotImplementedError`` (replicate variance is + closed-form; bootstrap would double-count variance). See + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes for the + full contract. Returns ------- @@ -725,13 +761,15 @@ def fit( f"weight_type='pweight', got '{resolved_survey.weight_type}'. " f"The survey IF variance math assumes probability weights." ) - if (resolved_survey.replicate_weights is not None - and resolved_survey.replicate_weights.shape[1] > 0): - raise NotImplementedError( - "Replicate weight variance for dCDH is not yet supported. " - "Use strata/PSU/FPC for design-based inference via Taylor " - "Series Linearization." 
- ) + # Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are supported + # for analytical variance via compute_replicate_if_variance() + # at each IF site (see _survey_se_from_group_if). The combination + # of replicate weights and n_bootstrap > 0 is rejected inside + # the bootstrap entry block (replicate variance is closed-form; + # bootstrap would double-count variance). Matches library + # precedent: efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + # Within-group-constant PSU/strata is required: the IF # expansion psi_i = U[g] * (w_i / W_g) supports within-group # variation in WEIGHTS (each obs contributes proportionally), @@ -778,6 +816,16 @@ def fit( "resolved": resolved_survey, } + # Replicate-weight variance tracker: each _compute_se call under + # a replicate design returns n_valid (number of replicate columns + # that produced a finite estimate). The effective df_survey is + # min(n_valid) - 1 across all IF sites — matches the precedent in + # `diff_diff/efficient_did.py:1133-1135` and + # `diff_diff/triple_diff.py:676-686`. Under TSL (analytical), + # _compute_se returns None for n_valid and df_survey falls through + # to resolved_survey.df_survey (= n_psu - n_strata). 
+ _replicate_n_valid_list: List[int] = [] + # ------------------------------------------------------------------ # Step 4b: Covariate aggregation (DID^X, Web Appendix Section 1.2) # ------------------------------------------------------------------ @@ -1600,17 +1648,19 @@ def fit( U_centered_l = _cohort_recenter(U_l_elig, cid_elig) N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] - se_l = _compute_se( + se_l, n_valid_l = _compute_se( U_centered=U_centered_l, divisor=N_l_h, obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_l, ) + if n_valid_l is not None: + _replicate_n_valid_list.append(n_valid_l) multi_horizon_se[l_h] = se_l did_l_val = multi_horizon_dids[l_h]["did_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) + t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey)) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1727,17 +1777,20 @@ def fit( cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] - se_pl_l = _compute_se( + se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_pl, ) + if n_valid_pl_l is not None: + _replicate_n_valid_list.append(n_valid_pl_l) placebo_horizon_se[lag_l] = se_pl_l pl_val = pl_data["placebo_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, df=_df_s + pl_val, se_pl_l, 
alpha=self.alpha, + df=_inference_df(_df_s, resolved_survey), ) placebo_horizon_inference[lag_l] = { "effect": pl_val, @@ -1810,12 +1863,14 @@ def fit( ) # Analytical SE for DID_M (survey-aware when survey_design provided) - overall_se = _compute_se( + overall_se, n_valid_overall = _compute_se( U_centered=U_centered_overall, divisor=N_S, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_overall is not None: + _replicate_n_valid_list.append(n_valid_overall) # Detect the degenerate-cohort case: every variance-eligible group # forms its own (D_{g,1}, F_g, S_g) cohort, so the centered # influence function is identically zero and `_plugin_se` returns @@ -1840,23 +1895,26 @@ def fit( UserWarning, stacklevel=2, ) - _df_survey = ( - resolved_survey.df_survey if resolved_survey is not None else None - ) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, df=_df_survey + overall_att, overall_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) # Joiners SE (uses joiner-only centered IF; conservative bound) if joiners_available: - joiners_se = _compute_se( + joiners_se, n_valid_joiners = _compute_se( U_centered=U_centered_joiners, divisor=joiner_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_joiners is not None: + _replicate_n_valid_list.append(n_valid_joiners) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, df=_df_survey + joiners_att, joiners_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: joiners_se, joiners_t, joiners_p, joiners_ci = ( @@ -1868,14 +1926,18 @@ def fit( # Leavers SE if leavers_available: - leavers_se = _compute_se( + leavers_se, n_valid_leavers = _compute_se( 
U_centered=U_centered_leavers, divisor=leaver_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_leavers is not None: + _replicate_n_valid_list.append(n_valid_leavers) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, df=_df_survey + leavers_att, leavers_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: leavers_se, leavers_t, leavers_p, leavers_ci = ( @@ -1916,14 +1978,81 @@ def fit( bootstrap_results: Optional[DCDHBootstrapResults] = None if self.n_bootstrap > 0: if resolved_survey is not None: - warnings.warn( - "Bootstrap with survey_design uses group-level multiplier " - "weights, not PSU-level. This is conservative when groups " - "are finer than PSUs. PSU-level survey bootstrap is " - "deferred to a future release.", - UserWarning, - stacklevel=2, - ) + # Replicate-weight designs have their own closed-form + # variance (Rao-Wu / BRR / Fay / JK1 / JKn / SDR via + # compute_replicate_if_variance). Combining replicate + # variance with a multiplier bootstrap would double-count + # variance. Match library precedent: + # efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + if ( + resolved_survey.replicate_weights is not None + and resolved_survey.replicate_weights.shape[1] > 0 + ): + raise NotImplementedError( + "dCDH survey support rejects the combination of " + "replicate weights and n_bootstrap > 0. Replicate-" + "weight variance is closed-form " + "(compute_replicate_if_variance); use n_bootstrap=0. " + "For a bootstrap-based SE, use strata/PSU/FPC design " + "instead of replicate weights." + ) + + # Warning fires only when PSU is strictly coarser than + # group. Under auto-inject psu=group (or explicit + # psu=), groups and PSUs coincide so + # Hall-Mammen wild PSU bootstrap equals group-level + # multiplier bootstrap — no need to warn. 
+ # Count groups/PSUs on the POST-FILTER eligible set + # (`_eligible_group_ids`) — the same set the bootstrap + # map is built from below. Using raw positive-weight + # groups here would emit a misleading warning on + # panels where upstream dCDH filtering drops groups + # that happen to share PSUs with kept groups. + psu_arr_warn = getattr(resolved_survey, "psu", None) + if psu_arr_warn is None or _obs_survey_info is None: + # No PSU info — can't compare to group count. + n_psu_eff_warn, n_groups_eff_warn = -1, -1 + else: + obs_gids_warn = np.asarray(_obs_survey_info["group_ids"]) + obs_ws_warn = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_warn = obs_ws_warn > 0 + psu_codes_warn = np.asarray(psu_arr_warn) + # Collect the PSU label for each variance-eligible + # group (within-group-constant PSU is validated + # upstream, so the first positive-weight label + # represents the whole group). + eligible_psu_labels: List[Any] = [] + for gid in _eligible_group_ids: + mask_g = (obs_gids_warn == gid) & pos_mask_warn + if mask_g.any(): + eligible_psu_labels.append( + psu_codes_warn[mask_g][0] + ) + n_groups_eff_warn = len(eligible_psu_labels) + n_psu_eff_warn = ( + int(len(np.unique(np.asarray(eligible_psu_labels)))) + if eligible_psu_labels + else -1 + ) + if 0 <= n_psu_eff_warn < n_groups_eff_warn: + warnings.warn( + f"Bootstrap with survey_design uses Hall-Mammen " + f"wild multiplier weights at the PSU level " + f"(n_psu={n_psu_eff_warn} PSUs across " + f"n_groups={n_groups_eff_warn} groups). For " + f"designs with substantially unequal PSU sizes, " + f"the wild bootstrap may under-cover relative to " + f"analytical TSL inference; consider " + f"n_bootstrap=0 for the TSL variance. 
If n_psu " + f"is close to n_groups, lonely-PSU removal " + f"(lonely_psu='remove') may be collapsing " + f"singletons.", + UserWarning, + stacklevel=2, + ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att) if joiners_available else None ) @@ -2039,6 +2168,52 @@ def fit( h_data["did_l"], ) + # Under a survey design with PSU information, build a + # `group_id_to_psu_code` dict so the bootstrap mixin can + # derive each target's PSU map by ID lookup rather than by + # positional reuse. PSU/strata are within-group-constant by + # the _validate_group_constant_strata_psu precondition, so + # each variance-eligible group has exactly one PSU label. + # Under auto-inject psu=group each group has a unique PSU + # code and the bootstrap mixin's identity-map fast path + # reproduces the pre-PSU behavior bit-for-bit. + group_id_to_psu_code_bootstrap: Optional[Dict[Any, int]] = None + eligible_group_ids_bootstrap: Optional[np.ndarray] = None + if ( + resolved_survey is not None + and getattr(resolved_survey, "psu", None) is not None + and _obs_survey_info is not None + ): + obs_psu_codes = np.asarray(resolved_survey.psu) + obs_gids_boot = np.asarray(_obs_survey_info["group_ids"]) + obs_weights_boot = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_boot = obs_weights_boot > 0 + group_psu_labels: List[Any] = [] + valid_map = True + for gid in _eligible_group_ids: + mask_g = (obs_gids_boot == gid) & pos_mask_boot + if not mask_g.any(): + valid_map = False + break + labels = obs_psu_codes[mask_g] + # Within-group-constant PSU is validated upstream; + # the first label represents the whole group. + group_psu_labels.append(labels[0]) + if valid_map and len(group_psu_labels) == n_groups_for_overall_var: + # Factor PSU labels to dense integer codes. 
+ _, dense_codes = np.unique( + np.asarray(group_psu_labels), + return_inverse=True, + ) + dense_codes = np.asarray(dense_codes, dtype=np.int64) + group_id_to_psu_code_bootstrap = { + gid: int(code) + for gid, code in zip(_eligible_group_ids, dense_codes) + } + eligible_group_ids_bootstrap = np.asarray(_eligible_group_ids) + br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, u_centered_overall=U_centered_overall, @@ -2049,6 +2224,8 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, + group_id_to_psu_code=group_id_to_psu_code_bootstrap, + eligible_group_ids=eligible_group_ids_bootstrap, ) bootstrap_results = br @@ -2077,17 +2254,17 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_df_survey)[0] + overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_df_survey)[0] + joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_df_survey)[0] + leavers_t = 
safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2249,7 +2426,8 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, df=_df_survey + delta_val, delta_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: effective_overall_se = float("nan") @@ -2283,7 +2461,8 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, df=_df_survey + pl_data["placebo_l"], pl_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) placebo_event_study_dict[-lag_l] = { "effect": pl_data["placebo_l"], @@ -2334,7 +2513,8 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, df=_df_survey + eff, bs_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), )[0] # Phase 2: build normalized_effects with SE @@ -2347,7 +2527,7 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_df_survey) + t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey)) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2406,7 +2586,8 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, df=_df_survey + cum_effect, running_se_ub, alpha=self.alpha, + df=_inference_df(_df_survey, 
resolved_survey), ) cumulated[l_h] = { "effect": cum_effect, @@ -2507,6 +2688,7 @@ def fit( rank_deficient_action=self.rank_deficient_action, group_ids_order=np.array(all_groups), obs_survey_info=_obs_survey_info, + replicate_n_valid_list=_replicate_n_valid_list, ) twfe_weights_df = None @@ -2559,6 +2741,115 @@ def fit( effective_joiners_available = False effective_leavers_available = False + # R2 P1b: Finalize replicate-df propagation across public + # surfaces. When heterogeneity is active, its n_valid is + # appended to `_replicate_n_valid_list` AFTER the main + # surfaces (overall / joiners / leavers / event study / + # placebo horizon) have already computed their t/p/CI fields + # with an intermediate `_df_survey`. If heterogeneity reports + # the smallest n_valid, the main-surface inference would be + # anti-conservative relative to `survey_metadata.df_survey` + # and HonestDiD. Re-run safe_inference with the FINAL + # effective df so every surface agrees. + _final_eff_df = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + if _replicate_n_valid_list: + _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + # Recompute `effective_overall_*` directly — that's what + # ships in results at line ~2776+. `effective_overall_att/se` + # may differ from the raw `overall_att/se` under the delta + # (cost-benefit) path at L_max >= 2; recomputing from + # `effective_*` ensures both paths get the final df. + if np.isfinite(effective_overall_se): + effective_overall_t, effective_overall_p, effective_overall_ci = ( + safe_inference( + effective_overall_att, effective_overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + ) + # Keep `overall_*` in sync for any downstream code that + # reads them directly (e.g., placebo_event_study_dict + # construction flows from overall_*). 
+ overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + if joiners_available: + joiners_t, joiners_p, joiners_ci = safe_inference( + joiners_att, joiners_se, + alpha=self.alpha, df=_final_inf_df, + ) + if leavers_available: + leavers_t, leavers_p, leavers_ci = safe_inference( + leavers_att, leavers_se, + alpha=self.alpha, df=_final_inf_df, + ) + if multi_horizon_inference is not None: + for _lag_r2, _info_r2 in list(multi_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if placebo_horizon_inference is not None: + for _lag_r2, _info_r2 in list(placebo_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + # `placebo_event_study_dict` holds VALUE copies + # (not shared references) of the inner dicts, so + # mutating `placebo_horizon_inference[lag]` above + # does NOT propagate to the public surface. Update + # the negative-key mirror explicitly so the + # recomputed t/p/CI ship in results. 
+ if ( + placebo_event_study_dict is not None + and -_lag_r2 in placebo_event_study_dict + ): + placebo_event_study_dict[-_lag_r2]["t_stat"] = _t_r2 + placebo_event_study_dict[-_lag_r2]["p_value"] = _p_r2 + placebo_event_study_dict[-_lag_r2]["conf_int"] = _ci_r2 + if heterogeneity_effects: + for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): + if np.isfinite(_info_r2["se"]): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["beta"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + # Normalized effects: another public surface built with the + # pre-heterogeneity `_df_survey`. Recompute inference with + # the final df so t/p/CI match the other surfaces (and the + # NaN contract when the final df becomes undefined). + if normalized_effects_out is not None: + for _lag_r2, _info_r2 in list(normalized_effects_out.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + + # Persist the final effective df_survey into survey_metadata so + # downstream consumers — HonestDiD bounds (honest_did.py:973 + # reads results.survey_metadata.df_survey), exported metadata, + # and users — all see the same df that the recomputed + # inference above used. SurveyMetadata is a mutable @dataclass + # (diff_diff/survey.py:681), so direct attribute assignment is + # safe. 
+ if survey_metadata is not None: + survey_metadata.df_survey = _final_eff_df + results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, overall_se=effective_overall_se, @@ -3289,6 +3580,7 @@ def _compute_heterogeneity_test( rank_deficient_action: str = "warn", group_ids_order: Optional[np.ndarray] = None, obs_survey_info: Optional[Dict[str, Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Dict[int, Dict[str, Any]]: """Test for heterogeneous treatment effects (Web Appendix Section 1.5). @@ -3314,8 +3606,21 @@ def _compute_heterogeneity_test( Observation-level survey info with keys ``group_ids`` (raw per-row group labels), ``weights`` (per-row survey weights), and ``resolved`` (ResolvedSurveyDesign). When provided, the regression uses WLS with - per-group weights W_g = sum of obs survey weights, and SE is computed - via Binder TSL IF expansion through ``compute_survey_if_variance``. + per-group weights W_g = sum of obs survey weights. SE is computed + via Binder TSL IF expansion through ``compute_survey_if_variance`` + by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), + dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style + variance. The effective df for t-critical values follows the + site-level ``min(df_s, n_valid_het - 1)`` rule and the helper + mutates ``replicate_n_valid_list`` so the final + ``_effective_df_survey(...)`` sees this site's n_valid. + replicate_n_valid_list : list[int], optional + Shared accumulator for replicate-weight ``n_valid`` counts across + IF sites. When provided and a replicate design is in use, this + function appends its own ``n_valid_het`` before computing the + local effective df — so both the local inference fields and the + final ``survey_metadata.df_survey`` (set by ``fit()``) reflect + this site's contribution. 
Returns ------- @@ -3334,13 +3639,25 @@ def _compute_heterogeneity_test( obs_survey_info is not None and group_ids_order is not None ) if use_survey: - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - df_s = ( - resolved.df_survey if resolved is not None else None + # df_s starts from the shared effective df (base-capped by + # design df). Under replicate designs the helper handles the + # min(resolved.df_survey, min(n_valid) - 1) reduction and + # preserves None when the base df is undefined (QR-rank ≤ 1). + # Using the helper here — rather than re-deriving locally — + # keeps heterogeneity's df consistent with the main dCDH + # surfaces (R2 P1a). `list(... or [])` avoids accidental + # mutation of the caller's shared tracker at this site; the + # explicit append happens inside the horizon loop below. + df_s = _effective_df_survey( + resolved, list(replicate_n_valid_list or []) ) # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, @@ -3495,15 +3812,42 @@ def _compute_heterogeneity_test( obs_w_raw[mask_g] / w_sum_g ) - # Binder TSL variance across stratified PSUs. - var_s = compute_survey_if_variance(psi_obs, resolved) + # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR) + # vs Binder TSL across stratified PSUs. Mirrors the inline + # branch in _survey_se_from_group_if and the pattern in + # TripleDifference:1206-1238. Heterogeneity uses WLS with + # full-sample weights; theta_hat is treated as fixed per the + # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity + # under replicate — no per-replicate refits). 
+ if getattr(resolved, "uses_replicate_variance", False): + var_s, n_valid_het = compute_replicate_if_variance( + psi_obs, resolved + ) + if replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_het) + # Reduce df_s to reflect this horizon's n_valid. + # R2 P1a: when the shared base-capped df_s is None + # (undefined base df, e.g., QR-rank ≤ 1), the + # heterogeneity df MUST stay None — per-site n_valid + # cannot rescue a rank-deficient design. The + # _inference_df wrapper at the safe_inference call + # below coerces None to 0 under replicate, forcing + # NaN inference. + if df_s is None: + df_s_local = None + else: + df_s_local = min(int(df_s), int(n_valid_het) - 1) + else: + var_s = compute_survey_if_variance(psi_obs, resolved) + df_s_local = df_s se_het = ( float(np.sqrt(var_s)) if np.isfinite(var_s) and var_s > 0 else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s + beta_het, se_het, alpha=alpha, + df=_inference_df(df_s_local, resolved), ) results[l_h] = { @@ -4823,24 +5167,93 @@ def _validate_group_constant_strata_psu( ) +def _inference_df( + effective_df: Optional[int], + resolved_survey: Any, +) -> Optional[int]: + """Coerce an effective-df value for use in ``safe_inference(df=...)``. + + ``_effective_df_survey()`` returns ``None`` when the replicate + design's base df is undefined (QR-rank ≤ 1) or when the reduced + df would be < 1. ``safe_inference`` treats ``df=None`` as the + standard normal (z-inference) and only returns NaN for + ``df <= 0``. Under a replicate design, "undefined effective df" + MUST map to NaN inference per REGISTRY.md — NOT to z-inference. + + Returns: + - ``effective_df`` when defined (t-inference with that df). + - ``0`` when ``effective_df is None`` AND ``resolved_survey`` uses + replicate variance — forces ``safe_inference`` to return NaN + t/p/CI via its ``df <= 0`` branch. 
+ - ``None`` otherwise (no survey, or TSL design where the resolver + always returns an int df — z-inference is never reachable here). + """ + if effective_df is not None: + return effective_df + if resolved_survey is None: + return None + if getattr(resolved_survey, "uses_replicate_variance", False): + return 0 + return None + + +def _effective_df_survey( + resolved_survey: Any, + replicate_n_valid_list: List[int], +) -> Optional[int]: + """Compute the effective ``df_survey`` for t-critical values. + + Under replicate-weight designs, each IF site returns an ``n_valid`` + count (number of replicate columns that produced a finite estimate). + This helper **reduces** — never overwrites — the design-level + ``resolved_survey.df_survey`` (which for replicate designs is + ``QR-rank - 1`` per R's ``survey::degf()`` convention; see + ``diff_diff/survey.py:590``). Returns + ``min(resolved_survey.df_survey, min(n_valid) - 1)`` when both are + defined. + + Matches the precedent in ``diff_diff/efficient_did.py:1133-1135`` + and ``diff_diff/triple_diff.py:676-686`` (both reduce, not + replace). Under TSL (analytical) or non-survey fits, + ``replicate_n_valid_list`` is empty and this falls through to the + design-level df. Returns ``None`` when either the base df is + undefined (rank ≤ 1) or the reduced df would be < 1 — preserving + the NaN-inference contract from ``safe_inference``. + """ + if resolved_survey is None: + return None + base_df = resolved_survey.df_survey + if not replicate_n_valid_list: + return base_df + reduced_df = int(min(replicate_n_valid_list)) - 1 + if base_df is None or reduced_df < 1: + return None + return min(int(base_df), reduced_df) + + def _compute_se( U_centered: np.ndarray, divisor: int, obs_survey_info: Optional[dict], eligible_groups: Optional[list] = None, -) -> float: +) -> Tuple[float, Optional[int]]: """Dispatch to plug-in SE or survey-design-aware SE. When ``obs_survey_info`` is ``None``, falls back to the simple plug-in formula. 
Otherwise, expands group-level IFs and delegates - to TSL variance. + to TSL variance (or replicate-weight variance when the resolved + design carries replicate weights). + + Returns ``(se, n_valid_replicates)``. ``n_valid_replicates`` is + ``None`` under the plug-in / TSL paths, and the number of valid + replicate columns under the replicate path. """ if obs_survey_info is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if eligible_groups is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if divisor <= 0: - return float("nan") + return float("nan"), None # dCDH IFs are numerator-scale (U.sum() == N_S * DID_M). # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. @@ -4856,16 +5269,20 @@ def _survey_se_from_group_if( U_centered: np.ndarray, eligible_groups: list, obs_survey_info: dict, -) -> float: +) -> Tuple[float, Optional[int]]: """Compute survey-design-aware SE from group-level centered IFs. Expands group-level influence function values to observation level using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` - so that ``sum(psi_i for i in g) = U[g]``, then delegates to - ``compute_survey_if_variance()`` for the TSL design-based variance. + so that ``sum(psi_i for i in g) = U[g]``, then delegates to either + ``compute_survey_if_variance()`` (TSL / analytical path) or + ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) depending + on whether the resolved design carries replicate weights. This is a library extension not in the dCDH papers (the paper's - plug-in variance assumes iid sampling). + plug-in variance assumes iid sampling). The replicate path uses + Rao-Wu weight-ratio rescaling; see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey expansion. 
Parameters ---------- @@ -4880,10 +5297,18 @@ def _survey_se_from_group_if( Returns ------- - float - Survey-design-aware SE, or NaN if degenerate. + Tuple[float, Optional[int]] + ``(se, n_valid_replicates)``. ``se`` is the survey-design-aware + SE, or NaN if degenerate. ``n_valid_replicates`` is the number + of valid replicate columns used for variance under the replicate + path, or ``None`` under the TSL (analytical) path. Callers + track the min across sites to set an effective ``df_survey`` + for replicate designs. """ - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) # Degenerate-cohort contract (mirror _plugin_se): when the centered # IF is empty or every cohort is a singleton (→ recentered IF is @@ -4893,9 +5318,9 @@ def _survey_se_from_group_if( # through _compute_se (overall, joiners/leavers, multi-horizon # ATT, placebos, normalized/cumulated, heterogeneity). if U_centered.size == 0: - return float("nan") + return float("nan"), None if float((U_centered ** 2).sum()) <= 0: - return float("nan") + return float("nan"), None group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] @@ -4913,7 +5338,7 @@ def _survey_se_from_group_if( psi = np.zeros(n_obs, dtype=np.float64) if not pos_mask.any(): - return float("nan") + return float("nan"), None gids_eff = np.asarray(group_ids)[pos_mask] w_eff = weights_arr[pos_mask] @@ -4933,10 +5358,20 @@ def _survey_se_from_group_if( safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) psi[pos_mask] = u_obs_eff * (w_eff / safe_w) + # Dispatch: replicate variance (BRR/Fay/JK1/JKn/SDR) vs TSL. + # Mirrors the inline branch in TripleDifference:1206-1238 and + # EfficientDiD:1119-1142. Under replicate, returns (variance, n_valid); + # the caller propagates min(n_valid) to df_survey. 
+ if getattr(resolved, "uses_replicate_variance", False): + variance, n_valid = compute_replicate_if_variance(psi, resolved) + if not np.isfinite(variance) or variance < 0: + return float("nan"), n_valid + return float(np.sqrt(variance)), n_valid + variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: - return float("nan") - return float(np.sqrt(variance)) + return float("nan"), None + return float(np.sqrt(variance)), None def _build_group_time_design( @@ -5228,6 +5663,11 @@ def twowayfeweights( (matching ``fit(..., survey_design=sd).twfe_*``). Required to preserve fit-vs-helper parity under survey-backed inputs. Only ``weight_type='pweight'`` is supported; other types raise ValueError. + Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are accepted — + the TWFE diagnostic has no SE field on ``TWFEWeightsResult``, + so replicate weights only affect the cell aggregation path + (aggregated numbers are identical to + ``fit(..., survey_design=sd).twfe_*`` under the same input). Returns ------- @@ -5251,16 +5691,13 @@ def twowayfeweights( f"The TWFE diagnostic under survey uses survey-weighted cell " f"means; other weight types are not supported." ) - if ( - resolved is not None - and resolved.replicate_weights is not None - and resolved.replicate_weights.shape[1] > 0 - ): - raise NotImplementedError( - "Replicate weight variance for twowayfeweights() is not " - "supported. Use strata/PSU/FPC for design-based inference " - "via Taylor Series Linearization (matches the fit() path)." - ) + # Replicate-weight designs are accepted. TWFE diagnostic has no + # SE field on TWFEWeightsResult — the replicate weights only + # participate through the full-sample weight (resolved.weights), + # which drives survey-weighted cell aggregation in + # _validate_and_aggregate_to_cells. Diagnostic numbers + # (beta_fe, sigma_fe, fraction_negative) match + # fit(..., survey_design=sd).twfe_* under replicate input. 
# Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 7eea75cf..27552296 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -4,11 +4,18 @@ The dCDH papers prescribe only the analytical cohort-recentered plug-in variance from Web Appendix Section 3.7.3 of the dynamic companion paper. -This module adds an opt-in multiplier bootstrap clustered at the group -level, matching the inference convention used by ``CallawaySantAnna``, -``ImputationDiD``, and ``TwoStageDiD``. The bootstrap is a library -extension, not a paper requirement, and is documented as such in -``REGISTRY.md``. +This module adds an opt-in multiplier bootstrap, clustered at the group +level by default (matching the inference convention used by +``CallawaySantAnna``, ``ImputationDiD``, and ``TwoStageDiD``). Under +``survey_design`` with an explicitly-coarser PSU, the bootstrap switches +to PSU-level Hall-Mammen wild clustering: each PSU draws a single +multiplier and all groups within that PSU share it +(see ``_generate_psu_or_group_weights`` and ``_map_for_target`` below, +plus the REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + +bootstrap). Under the default auto-inject ``psu=group`` each group is +its own PSU and the identity-map fast path reproduces the original +group-level behavior bit-for-bit. The bootstrap is a library extension, +not a paper requirement, and is documented as such in ``REGISTRY.md``. The mixin operates on **pre-computed cohort-centered influence-function values**: the main estimator class computes per-group ``U^G_g`` values @@ -19,7 +26,7 @@ produce a bootstrap distribution per target. 
""" -from typing import TYPE_CHECKING, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np @@ -70,6 +77,9 @@ def _compute_dcdh_bootstrap( # --- Phase 2: multi-horizon inputs --- multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, + # --- Survey: PSU-level bootstrap under survey designs --- + group_id_to_psu_code: Optional[Dict[Any, int]] = None, + eligible_group_ids: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. @@ -160,6 +170,14 @@ def _compute_dcdh_bootstrap( ) rng = np.random.default_rng(self.seed) + # PSU label for each bootstrap weight column is derived from + # the group's ID via `_map_for_target`, not from positional + # truncation. All current dCDH bootstrap targets use the + # variance-eligible group ordering (`eligible_group_ids`); if a + # future target uses a different ordering, add a dedicated + # group-IDs parameter for it rather than reusing the overall + # eligible list. 
+ # --- Overall DID_M --- # Skip the scalar DID_M bootstrap when divisor_overall <= 0 # (e.g., pure non-binary panels where N_S=0), but continue @@ -175,6 +193,11 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, + group_to_psu_map=_map_for_target( + u_centered_overall.shape[0], + group_id_to_psu_code, + eligible_group_ids, + ), ) else: overall_se = np.nan @@ -206,6 +229,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_j.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -225,6 +251,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_l.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -244,6 +273,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_pl.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -259,19 +291,47 @@ def _compute_dcdh_bootstrap( es_pvals: Dict[int, float] = {} es_dists: Dict[int, np.ndarray] = {} - # Shared weight matrix sized for the group set + # Shared weight matrix sized for the group set. Under PSU-level + # bootstrap (Hall-Mammen wild PSU), weights are drawn once per + # PSU and broadcast to groups so all groups in the same PSU + # share a multiplier within a single bootstrap replicate — + # preserving the sup-t joint distribution across horizons. 
n_groups_mh = n_groups_for_overall - shared_weights = _generate_bootstrap_weights_batch( + shared_weights = _generate_psu_or_group_weights( n_bootstrap=self.n_bootstrap, - n_units=n_groups_mh, + n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, + group_to_psu_map=_map_for_target( + n_groups_mh, group_id_to_psu_code, eligible_group_ids, + ), ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): if u_h.size > 0 and n_h > 0: - # Use the shared weight matrix truncated to u_h length - w_h = shared_weights[:, : u_h.size] + # Under the current contract every horizon's IF + # vector uses the variance-eligible group ordering + # from `eligible_group_ids`, so the shared weight + # matrix is already at the right shape. Assert + # this invariant so any future refactor that + # introduces horizon-specific masking fails loudly + # rather than silently misaligning PSU clusters via + # positional truncation. + if u_h.size != n_groups_mh: + raise ValueError( + f"Multi-horizon bootstrap: horizon {l_h} " + f"IF vector has {u_h.size} entries but " + f"shared weight matrix has {n_groups_mh} " + f"columns. dCDH's contract requires every " + f"horizon to use the variance-eligible " + f"group ordering; to support a horizon " + f"with a different ordering, thread " + f"target-specific group IDs through " + f"`multi_horizon_inputs` and project the " + f"shared PSU draws onto the horizon's own " + f"ordering via `_map_for_target`." 
+ ) + w_h = shared_weights deviations = (w_h @ u_h) / n_h dist_h = deviations + eff_h @@ -324,6 +384,9 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_h.size, group_id_to_psu_code, eligible_group_ids, + ), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -341,6 +404,125 @@ def _compute_dcdh_bootstrap( # ============================================================================= +def _map_for_target( + target_size: int, + group_id_to_psu_code: Optional[Dict[Any, int]], + eligible_group_ids: Optional[np.ndarray], +) -> Optional[np.ndarray]: + """Build a PSU map for a bootstrap target from IDs (not positions). + + The caller passes: + - ``group_id_to_psu_code``: a dict mapping each variance-eligible + group ID to its dense PSU code (built once in ``fit()``). + - ``eligible_group_ids``: the ordered list of group IDs that + correspond to the current target's ``u_centered`` vector. + + Returns an integer array of length ``target_size`` where entry + ``i`` is the PSU code for the ``i``-th contributing group. + + Returns ``None`` when no PSU information is available (plain + multiplier-bootstrap path — identity across targets). + + Raises ``ValueError`` if ``target_size`` does not match + ``len(eligible_group_ids)``: every current dCDH bootstrap target + uses the variance-eligible group ordering, so any size mismatch + signals that a caller introduced a target whose group subset + diverges and should pass its own ``target_group_ids`` rather than + reusing the overall eligible list. Also raises ``ValueError`` if + any group ID is missing from the dict (signaling misalignment + between the target's IF vector and the map's keys). 
+    """
+    if group_id_to_psu_code is None or eligible_group_ids is None:
+        return None
+    if target_size != len(eligible_group_ids):
+        raise ValueError(
+            f"Bootstrap target size ({target_size}) does not match "
+            f"eligible_group_ids length ({len(eligible_group_ids)}). "
+            "dCDH's bootstrap contract requires all current targets to "
+            "use the variance-eligible group ordering; if a new target "
+            "has a different ordering, pass target-specific group IDs "
+            "to _map_for_target rather than reusing eligible_group_ids."
+        )
+    try:
+        return np.array(
+            [group_id_to_psu_code[gid] for gid in eligible_group_ids],
+            dtype=np.int64,
+        )
+    except KeyError as e:
+        raise ValueError(
+            f"Group ID {e.args[0]!r} in eligible_group_ids has no entry "
+            f"in group_id_to_psu_code — PSU map is misaligned with the "
+            f"bootstrap target's group set."
+        ) from e
+
+
+def _generate_psu_or_group_weights(
+    n_bootstrap: int,
+    n_groups_target: int,
+    weight_type: str,
+    rng: np.random.Generator,
+    group_to_psu_map: Optional[np.ndarray],
+) -> np.ndarray:
+    """Generate a group-level weight matrix, possibly via PSU broadcasting.
+
+    When ``group_to_psu_map`` is ``None`` or is the identity (each group
+    is its own PSU), generates weights at the group level directly —
+    bit-identical to the pre-PSU-bootstrap contract.
+
+    When ``group_to_psu_map`` has fewer unique values than
+    ``n_groups_target`` (strictly coarser PSU than group), generates
+    weights at the PSU level and broadcasts to groups via the map.
+    This is the Hall-Mammen wild PSU bootstrap.
+
+    Parameters
+    ----------
+    n_bootstrap, weight_type, rng
+        Passed through to ``_generate_bootstrap_weights_batch``.
+    n_groups_target : int
+        Number of groups contributing to the target's IF vector.
+    group_to_psu_map : np.ndarray or None
+        Dense integer PSU indices of shape ``(n_groups_target,)``.
+        ``None`` triggers the group-level path.
+
+    Returns
+    -------
+    np.ndarray
+        Shape ``(n_bootstrap, n_groups_target)`` multiplier weights.
+ """ + if group_to_psu_map is None: + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + if len(group_to_psu_map) != n_groups_target: + raise ValueError( + f"group_to_psu_map length ({len(group_to_psu_map)}) does not " + f"match n_groups_target ({n_groups_target})." + ) + n_psu = int(np.max(group_to_psu_map)) + 1 if group_to_psu_map.size > 0 else 0 + if n_psu >= n_groups_target: + # Identity (each group its own PSU) — skip the broadcast for a + # bit-identical fast path matching the pre-PSU-bootstrap behavior. + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + # Hall-Mammen wild PSU bootstrap: draw n_psu multipliers, broadcast + # via the dense index map so all groups in the same PSU share a + # multiplier. Preserves clustered sampling structure. + psu_weights = _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_psu, + weight_type=weight_type, + rng=rng, + ) + return psu_weights[:, group_to_psu_map] + + def _bootstrap_one_target( u_centered: np.ndarray, divisor: int, @@ -351,6 +533,7 @@ def _bootstrap_one_target( rng: np.random.Generator, context: str, return_distribution: bool, + group_to_psu_map: Optional[np.ndarray] = None, ) -> Tuple[float, Tuple[float, float], float, Optional[np.ndarray]]: """ Run the multiplier bootstrap for a single dCDH target. @@ -367,16 +550,24 @@ def _bootstrap_one_target( sample mean of the bootstrap distribution should be approximately zero, not the original effect). The original effect is passed separately as the centering point for the percentile p-value. + + When ``group_to_psu_map`` is provided (length ``len(u_centered)``, + dense integer PSU indices), multiplier weights are generated at the + PSU level and broadcast to groups so all groups in the same PSU + receive the same bootstrap multiplier. 
This is the Hall-Mammen wild + PSU bootstrap; it reduces to the group-level bootstrap when each + group is its own PSU (identity map). """ n_groups_target = u_centered.shape[0] if n_groups_target == 0 or divisor == 0: return np.nan, (np.nan, np.nan), np.nan, None - weight_matrix = _generate_bootstrap_weights_batch( + weight_matrix = _generate_psu_or_group_weights( n_bootstrap=n_bootstrap, - n_units=n_groups_target, + n_groups_target=n_groups_target, weight_type=weight_type, rng=rng, + group_to_psu_map=group_to_psu_map, ) # Each bootstrap replicate: (1 / divisor) * sum_g w_b[g] * u_centered[g] diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7bbd950c..c9b2851e 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -266,7 +266,7 @@ est.fit( trends_nonparam: Any | None = None, # DID^s state-set-specific trends honest_did: bool = False, # HonestDiD sensitivity on placebos # ---- survey support ---- - survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) + survey_design: SurveyDesign | None = None, # pweight (TSL or replicate BRR/Fay/JK1/JKn/SDR); n_bootstrap > 0 uses Hall-Mammen PSU multiplier ) -> ChaisemartinDHaultfoeuilleResults ``` @@ -320,9 +320,9 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") **Notes:** - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` -- Placebo SE contract: single-period placebo `DID_M^pl` (`L_max=None`) has `NaN` SE because the per-period aggregation path has no influence-function derivation; inference fields stay NaN-consistent **even when `n_bootstrap > 0`** for the single-period path (bootstrap covers `DID_M`, `DID_+`, and `DID_-` only). 
Multi-horizon dynamic placebos `DID^{pl}_l` (`L_max >= 1`) have valid analytical SE via the placebo influence function (same cohort-recentered structure as positive horizons, applied to backward outcome differences), with bootstrap SE override when `n_bootstrap > 0`. This is a library extension beyond the dynamic companion paper, which states Theorem 1 variance for `DID_l` only. +- Placebo SE contract: single-period placebo `DID_M^pl` (`L_max=None`) has `NaN` SE because the per-period aggregation path has no influence-function derivation; inference fields stay NaN-consistent **even when `n_bootstrap > 0`** for the single-period path (single-period bootstrap covers only `DID_M`, `DID_+`, and `DID_-`). Multi-horizon dynamic placebos `DID^{pl}_l` (`L_max >= 1`) have valid analytical SE via the placebo influence function (same cohort-recentered structure as positive horizons, applied to backward outcome differences), with bootstrap SE override when `n_bootstrap > 0` — bootstrap at `L_max >= 1` covers `DID_M`, `DID_+`, `DID_-`, per-horizon event-study effects (`event_study_effects[l]`), and placebo horizons (`placebo_event_study[-l]`), with shared weights across horizons for valid joint (sup-t) bands. This is a library extension beyond the dynamic companion paper, which states Theorem 1 variance for `DID_l` only. - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling -- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred +- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. 
Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) ### SunAbraham @@ -1672,8 +1672,8 @@ sd_female, data_female = sd.subpopulation(data, mask=lambda df: df['sex'] == 'F' **Key features:** - Taylor Series Linearization (TSL) variance with strata + PSU + FPC -- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (12 of 16 estimators) -- Survey-aware bootstrap: multiplier at PSU or Rao-Wu rescaled +- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (13 of 16 estimators, including dCDH) +- Survey-aware bootstrap: multiplier at PSU (Hall-Mammen wild; dCDH, staggered) or Rao-Wu rescaled (SyntheticDiD, SunAbraham) - DEFF diagnostics, subpopulation analysis, weight trimming (`trim_weights`) - Repeated cross-sections: `CallawaySantAnna(panel=False)` - Compatibility matrix: see `docs/choosing_estimator.rst` Survey Design Support section diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index b4cd419f..f1c575cf 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -463,7 +463,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1: - [de Chaisemartin, C. & D'Haultfœuille, X. (2020). Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects. *American Economic Review*, 110(9), 2964-2996.](https://doi.org/10.1257/aer.20181169) - [de Chaisemartin, C. & D'Haultfœuille, X. (2022, revised 2024). Difference-in-Differences Estimators of Intertemporal Treatment Effects. NBER Working Paper 29873.](https://www.nber.org/papers/w29873) — Web Appendix Section 3.7.3 contains the cohort-recentered plug-in variance formula implemented here. -**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. 
Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization; replicate weights and PSU-level bootstrap are deferred. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. +**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR) across all IF sites, plus opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0` (see the full checklist + Notes below for the contract). **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. 
**Key implementation requirements:** @@ -589,7 +589,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** When every variance-eligible group forms its own `(D_{g,1}, F_g, S_g)` cohort (a degenerate small-panel case where the cohort framework has zero degrees of freedom), the cohort-recentered plug-in formula is unidentified: cohort recentering subtracts the cohort mean from each group's `U^G_g`, and for singleton cohorts the centered value is exactly zero, so the centered influence function vector collapses to all zeros. The estimator returns `overall_se = NaN` with a `UserWarning` rather than silently collapsing to `0.0` (which would falsely imply infinite precision). The `DID_M` point estimate remains well-defined. The bootstrap path inherits the same degeneracy on these panels — the multiplier weights act on an all-zero vector, so the bootstrap distribution is also degenerate. **Deviation from R `DIDmultiplegtDYN`:** R returns a non-zero SE on the canonical 4-group worked example via small-sample sandwich machinery that Python does not implement. Both responses are valid for a degenerate case; Python's `NaN`+warning is the safer default. To get a non-degenerate SE, include more groups so cohorts have peers (real-world panels typically have `G >> K`). -- **Note (Phase 1 cluster contract):** `ChaisemartinDHaultfoeuille` always clusters at the group level. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group; both inference paths cluster at the user's `group` column with no other option. The constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` with a Phase 1 pointer at construction time (and the same gate fires from `set_params`). Custom clustering at a coarser or finer level than the group is reserved for a future phase. 
The matching test is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. +- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. **Automatic PSU-level clustering under `survey_design`:** when `survey_design` carries PSUs strictly coarser than the group and `n_bootstrap > 0`, the multiplier bootstrap automatically switches to Hall-Mammen wild PSU clustering (see the Note on survey + bootstrap below). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and PSU-level paths are identical (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. - **Note (bootstrap inference surface):** When `n_bootstrap > 0`, the top-level `results.overall_p_value` / `results.overall_conf_int` (and joiners/leavers analogues) hold **percentile-based bootstrap inference** computed by the multiplier bootstrap, NOT normal-theory recomputations from the bootstrap SE. The t-stat (`overall_t_stat`, etc.) is computed from the SE via `safe_inference()[0]` to satisfy the project's anti-pattern rule (never compute `t = effect / se` inline) — bootstrap does not define an alternative t-stat semantic for percentile bootstrap, so the SE-based t-stat is the natural choice. 
`event_study_effects[1]`, `summary()`, `to_dataframe()`, `is_significant`, and `significance_stars` all read from these top-level fields and therefore reflect the bootstrap inference automatically. The library precedent for this propagation is `imputation.py:790-805`, `two_stage.py:778-787`, and `efficient_did.py:1009-1013`. The single-period placebo (`L_max=None`) still has NaN bootstrap fields; multi-horizon placebos (`L_max >= 1`) have valid bootstrap SE/CI/p via `placebo_horizon_ses/cis/p_values` on the bootstrap results object. The matching test is `test_bootstrap_p_value_and_ci_propagated_to_top_level` in `tests/test_chaisemartin_dhaultfoeuille.py::TestBootstrap`. @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. 
The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. -- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. 
+- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. 
The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. 
@@ -633,7 +633,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Single-lag placebo `DID_M^pl` (point estimate; SE deferred to Phase 2) - [x] TWFE decomposition diagnostic (Theorem 1 of AER 2020): per-cell weights, fraction negative, `sigma_fe`, `beta_fe` - [x] Standalone `twowayfeweights()` helper for users who only want the TWFE diagnostic -- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group +- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group by default; automatically upgraded to PSU-level Hall-Mammen wild clustering under `survey_design` with strictly-coarser PSUs - [x] `drop_larger_lower=True` default (matches R `DIDmultiplegtDYN`); `False` opt-in with explicit inconsistency warning - [x] Singleton-baseline filter (footnote 15 of dynamic paper, variance computation only) with explicit warning - [x] Never-switching groups participate in the variance via stable-control roles after the Round 2 full-IF fix; `n_groups_dropped_never_switching` field retained as backwards-compatibility metadata only @@ -648,9 +648,9 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface -- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. 
The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). -- **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. 
The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. 
Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. 
--- diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 1193ce60..081d9e92 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -404,32 +404,35 @@ def test_survey_design_rejects_fweight(self, data): def test_cluster_parameter_raises_not_implemented(self, data): """ - Per Phase 1 cluster contract: dCDH always clusters at the - group level via the cohort-recentered influence function - (analytical SEs) and the multiplier bootstrap (also grouped at - the group column). Custom clustering is not supported in - Phase 1. + Per the dCDH cluster contract: dCDH clusters at the group + level by default via the cohort-recentered influence function + (analytical SEs) and the multiplier bootstrap. Under + ``survey_design`` with strictly-coarser PSUs the bootstrap + automatically upgrades to PSU-level Hall-Mammen wild. User- + specified clustering via the ``cluster=`` kwarg is not + supported. The reviewer flagged that ``cluster`` was previously accepted on ``__init__`` and stored on ``self.cluster`` but never actually read by ``fit()`` or ``_compute_dcdh_bootstrap()``, - making it a silent no-op. This test pins the new contract: any + making it a silent no-op. This test pins the contract: any non-None cluster value raises ``NotImplementedError`` at construction time with a message naming the offending value - and pointing at the Phase 1 reservation. The same gate fires - from ``set_params``. + and pointing at the no-custom-clustering reservation. The + same gate fires from ``set_params``. - See REGISTRY.md ``Note (Phase 1 cluster contract)``. + See REGISTRY.md ``Note (cluster contract)``. 
""" + pattern = r"cluster.*(not supported|reserved for a future)" # __init__ rejects any non-None cluster - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="state") - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="unit") # set_params after construction also rejects est = ChaisemartinDHaultfoeuille() - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): est.set_params(cluster="state") # cluster=None still works (the only supported value) @@ -439,7 +442,7 @@ def test_cluster_parameter_raises_not_implemented(self, data): # The convenience function also rejects (forward-compat gate # propagates through the wrapper at __init__ time) - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): chaisemartin_dhaultfoeuille( data, outcome="outcome", diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c2c7c0f0..1aa9f14c 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -317,8 +317,16 @@ def test_varying_weights_change_att(self, base_data): result_survey.overall_att, abs=0.01 ) - def test_rejects_replicate_weights(self, base_data): - """Replicate weight variance not yet supported.""" + def test_rejects_replicate_weights_with_bootstrap(self, base_data): + """Replicate weights combined with n_bootstrap > 0 is rejected. + + Replicate variance is closed-form (compute_replicate_if_variance); + combining it with a multiplier bootstrap would double-count + variance. Matches library precedent in efficient_did.py:989, + staggered.py:1869, two_stage.py:251-253. 
The standalone + replicate-only path (n_bootstrap=0) is supported separately; + see tests/test_survey_dcdh_replicate_psu.py. + """ df = base_data.copy() df["pw"] = 1.0 df["rep1"] = 1.0 @@ -328,8 +336,8 @@ def test_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - ChaisemartinDHaultfoeuille().fit( + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( df, outcome="outcome", group="group", @@ -366,9 +374,31 @@ def test_multi_horizon_survey_runs(self, data_with_survey): class TestBootstrapSurveyWarning: - def test_bootstrap_survey_emits_warning(self, data_with_survey): + def test_bootstrap_survey_auto_inject_no_warning(self, data_with_survey): + """Under auto-inject psu=group, Hall-Mammen wild PSU bootstrap + coincides with the group-level multiplier bootstrap — so no + warning should fire. The old 'PSU-level deferred' warning has + been replaced with a conditional one that only fires when the + user passes a strictly coarser PSU. + + See the new test file tests/test_survey_dcdh_replicate_psu.py for + the strictly-coarser-PSU case where the warning DOES fire. + """ sd = SurveyDesign(weights="pw") - with pytest.warns(UserWarning, match="group-level multiplier"): + import warnings as _w + + with _w.catch_warnings(): + _w.simplefilter("error", UserWarning) + # Ignore warnings unrelated to the bootstrap-PSU contract. 
+ _w.filterwarnings( + "ignore", message="Single-period placebo SE" + ) + _w.filterwarnings( + "ignore", message="pweight weights normalized" + ) + _w.filterwarnings( + "ignore", message="Assumption 11" + ) ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( data_with_survey, outcome="outcome", @@ -728,9 +758,11 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): survey_design=sd, ) - def test_twfe_helper_rejects_replicate_weights(self, base_data): - """Replicate-weight survey designs must be rejected by the helper, - matching fit()'s NotImplementedError contract.""" + def test_twfe_helper_accepts_replicate_weights(self, base_data): + """Replicate-weight designs are accepted by the helper (no SE field + on TWFEWeightsResult, so only cell aggregation is affected). Matches + fit()'s new contract where replicate variance runs analytically via + compute_replicate_if_variance.""" from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights df_ = base_data.copy() @@ -742,13 +774,15 @@ def test_twfe_helper_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - twowayfeweights( - df_, - outcome="outcome", group="group", - time="period", treatment="treatment", - survey_design=sd, - ) + result = twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.beta_fe) + assert np.isfinite(result.sigma_fe) + assert 0.0 <= result.fraction_negative <= 1.0 # ── Test: TWFE diagnostic oracle under survey ─────────────────────── diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py new file mode 100644 index 00000000..595f80bd --- /dev/null +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -0,0 +1,1166 @@ +"""Replicate-weight variance and PSU-level bootstrap tests for dCDH. 
+ +Covers the two survey-variance extensions added after PR #307: +1. Replicate-weight variance (BRR/Fay/JK1/JKn/SDR) via the unified + `compute_replicate_if_variance` helper, routed through the inline + branch in `_survey_se_from_group_if` (Class A sites) and the + parallel inline branch in `_compute_heterogeneity_test` (Class B). +2. PSU-level Hall-Mammen wild bootstrap via `group_to_psu_map` + threaded through `_compute_dcdh_bootstrap`, with an identity-map + fast path preserving auto-inject `psu=group` semantics. +""" + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign +from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + +# ── Fixtures ──────────────────────────────────────────────────────── + + +REPLICATE_METHODS = ["BRR", "Fay", "JK1", "JKn", "SDR"] + + +def _make_reversible_panel( + n_groups: int = 30, + n_periods: int = 5, + seed: int = 42, +) -> pd.DataFrame: + """Simple reversible-treatment panel with two switch cohorts.""" + rng = np.random.default_rng(seed) + rows = [] + for g in range(n_groups): + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "x_het": float(g) * 0.1, + } + ) + return pd.DataFrame(rows) + + +def _attach_replicate_weights( + df: pd.DataFrame, + R: int, + method: str, + seed: int = 101, +) -> pd.DataFrame: + """Add R replicate-weight columns and return the mutated DataFrame.""" + rng = np.random.default_rng(seed) + df = df.copy() + for r in range(R): + if method == "BRR" or method == "Fay" or method == "SDR": + df[f"rep{r}"] = rng.choice([0.5, 1.5], size=len(df)) + elif method in ("JK1", "JKn"): + df[f"rep{r}"] = rng.uniform(0.5, 1.5, size=len(df)) + return df + + +def _rep_cols(R: int) -> list: + return [f"rep{r}" for r in range(R)] + + 
+def _build_replicate_design(R: int, method: str) -> SurveyDesign:
+    """Build a SurveyDesign object with method-specific required params."""
+    if method == "Fay":
+        return SurveyDesign(
+            weights="pw",
+            replicate_weights=_rep_cols(R),
+            replicate_method="Fay",
+            fay_rho=0.5,
+        )
+    if method == "JKn":
+        return SurveyDesign(
+            weights="pw",
+            replicate_weights=_rep_cols(R),
+            replicate_method="JKn",
+            replicate_strata=[r % 2 for r in range(R)],
+        )
+    return SurveyDesign(
+        weights="pw",
+        replicate_weights=_rep_cols(R),
+        replicate_method=method,
+    )
+
+
+@pytest.fixture(scope="module")
+def base_panel():
+    return _make_reversible_panel(n_groups=30, n_periods=5, seed=42)
+
+
+@pytest.fixture
+def replicate_design():
+    """Helper attaching replicate-weight columns to a copy of a DataFrame for a given method."""
+
+    def _build(df: pd.DataFrame, R: int, method: str, seed: int = 101) -> pd.DataFrame:
+        return _attach_replicate_weights(df, R=R, method=method, seed=seed)
+
+    return _build
+
+
+def _make_strictly_coarser_psu_panel(
+    n_groups_per_psu: int = 3,
+    n_psu: int = 4,
+    n_periods: int = 5,
+    seed: int = 17,
+) -> pd.DataFrame:
+    """Panel where groups nest strictly inside PSUs.
+
+    12 groups across 4 PSUs (3 groups/PSU). Two switch cohorts so the
+    estimator has joiners at two different F_g.
+    """
+    rng = np.random.default_rng(seed)
+    n_groups = n_groups_per_psu * n_psu
+    rows = []
+    for g in range(n_groups):
+        psu = g // n_groups_per_psu
+        f = 2 if g < n_groups // 2 else 3
+        for t in range(n_periods):
+            d = 1 if t >= f else 0
+            y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3)
+            rows.append(
+                {
+                    "group": g,
+                    "period": t,
+                    "treatment": d,
+                    "outcome": y,
+                    "pw": 1.0,
+                    "psu": psu,
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+# ── 1. 
Class A replicate (sites via _survey_se_from_group_if) ──────── + + +class TestReplicateClassA: + """Replicate variance wired through the inline branch in + `_survey_se_from_group_if` — inherited by overall DID_M, joiners + DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l. + """ + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_overall_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) + assert res.overall_se > 0 + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_inference_fields_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_t_stat) + assert np.isfinite(res.overall_p_value) + assert np.all(np.isfinite(res.overall_conf_int)) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_df_survey_reflects_n_valid(self, base_panel, replicate_design, method): + R = 24 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # All replicates should succeed on a clean panel → df = n_valid - 1 = R - 1 + assert res.survey_metadata.df_survey == R - 1 + + def test_jk1_converges_to_tsl(self, replicate_design): + """JK1 replicate SE should be in the same order of magnitude as + analytical TSL 
SE on a large panel. Uses a loose factor-of-2 tolerance
+        because convergence depends on weight-construction details; the
+        stringent R-parity convergence test lives in
+        `test_survey_phase6.py`."""
+        df = _make_reversible_panel(n_groups=60, n_periods=6, seed=42)
+        R = 30
+        df_rep = replicate_design(df, R=R, method="JK1")
+
+        sd_rep = _build_replicate_design(R, "JK1")
+        sd_tsl = SurveyDesign(weights="pw")
+
+        res_rep = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_rep,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+            survey_design=sd_rep,
+        )
+        res_tsl = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+            survey_design=sd_tsl,
+        )
+        # Same point estimate (both unweighted)
+        assert res_rep.overall_att == pytest.approx(res_tsl.overall_att, rel=1e-6)
+        # SEs should be within a factor of 2 of each other (broad envelope)
+        ratio = res_rep.overall_se / res_tsl.overall_se
+        assert 0.5 <= ratio <= 2.0, (
+            f"Replicate SE ({res_rep.overall_se:.4f}) and TSL SE "
+            f"({res_tsl.overall_se:.4f}) differ by more than 2x."
+ ) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_multi_horizon_under_replicate( + self, replicate_design, method + ): + """Multi-horizon DID_l inherits the Class A dispatch — horizon 1 + (always identifiable on this panel) should produce a finite + replicate-based SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_placebo_under_replicate(self, replicate_design): + """Placebo DID^{pl}_l under replicate weights carries finite SE, + p-value, and CI. Placebo horizons are stored under NEGATIVE + keys in ``placebo_event_study`` (see REGISTRY.md line 546 — + ``-l`` corresponds to backward outcome difference + ``Y_{g, F_g-1-l} - Y_{g, F_g-1}``). Pre-R3-fix this test used + key ``1`` and silently skipped the assertion.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + assert res.placebo_event_study, ( + "Expected placebo_event_study to be populated for L_max=1 " + "reversible panel" + ) + # Negative key convention: placebo horizon l → key -l. 
+ assert -1 in res.placebo_event_study, ( + f"Expected key -1 in placebo_event_study (got keys " + f"{sorted(res.placebo_event_study)})" + ) + info = res.placebo_event_study[-1] + assert info["n_obs"] > 0, "Placebo horizon should have observations" + assert np.isfinite(info["se"]) and info["se"] > 0 + assert np.isfinite(info["p_value"]) + assert np.all(np.isfinite(info["conf_int"])) + + def test_did_x_replicate(self, base_panel, replicate_design): + """DID^X covariate adjustment flows through the Class A dispatch + (Y_mat residualization happens upstream of IF).""" + df = base_panel.copy() + rng = np.random.default_rng(123) + df["cov1"] = rng.normal(size=len(df)) + R = 20 + df = replicate_design(df, R=R, method="JK1") + sd = _build_replicate_design(R, "JK1") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + controls=["cov1"], + L_max=1, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + +# ── 2. 
Class B replicate (heterogeneity + twowayfeweights) ────────── + + +class TestReplicateClassB: + """Sites that call `compute_survey_if_variance` directly (or don't + route through the variance path at all) and need their own + replicate handling.""" + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_heterogeneity_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + heterogeneity="x_het", + ) + assert res.heterogeneity_effects is not None + info = res.heterogeneity_effects[1] + assert np.isfinite(info["beta"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_twowayfeweights_accepts_replicate(self, base_panel, replicate_design): + """TWFE diagnostic produces the same beta_fe / sigma_fe / + fraction_negative under a replicate design as under a + pweight-only design (replicate weights only affect + `resolved.weights` cell aggregation).""" + R = 20 + df = replicate_design(base_panel, R=R, method="BRR") + sd_rep = _build_replicate_design(R, "BRR") + sd_plain = SurveyDesign(weights="pw") + + res_rep = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_plain = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_plain, + ) + # Cell aggregation uses resolved.weights; replicate_weights are + # present but don't change the aggregated diagnostics. + assert res_rep.beta_fe == pytest.approx(res_plain.beta_fe, rel=1e-10) + assert res_rep.sigma_fe == pytest.approx(res_plain.sigma_fe, rel=1e-10) + assert res_rep.fraction_negative == pytest.approx( + res_plain.fraction_negative, rel=1e-10 + ) + + +# ── 3. 
PSU-level Hall-Mammen wild bootstrap ───────────────────────── + + +class TestPSUBootstrap: + + def test_auto_inject_bit_identical_to_group_level(self, base_panel): + """Under auto-inject psu=group the Hall-Mammen PSU bootstrap + collapses to the group-level multiplier bootstrap via the + identity-map fast path — producing bit-identical SE.""" + df = base_panel.copy() + sd_auto = SurveyDesign(weights="pw") + sd_explicit = SurveyDesign(weights="pw", psu="group") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + r_auto = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_auto, + ) + r_explicit = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_explicit, + ) + assert r_auto.overall_se == pytest.approx(r_explicit.overall_se, rel=1e-10) + + def test_coarser_psu_produces_finite_se(self): + """Under a strictly coarser PSU, the Hall-Mammen PSU bootstrap + still produces a finite SE (the value depends on within-PSU + IF correlation).""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_no_warning_under_auto_inject(self, base_panel): + """No UserWarning about PSU-level bootstrap should fire under + auto-inject psu=group (the warning only fires for strictly + coarser PSU).""" + sd = SurveyDesign(weights="pw") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + ChaisemartinDHaultfoeuille(n_bootstrap=100, 
seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert not any( + "Hall-Mammen wild" in str(w.message) + or "group-level multiplier" in str(w.message) + for w in caught + ), ( + "Bootstrap-PSU warning should not fire under auto-inject psu=group, " + f"but got: {[str(w.message) for w in caught]}" + ) + + def test_warning_under_coarser_psu(self): + """When PSU is strictly coarser than group, the Hall-Mammen + warning fires.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with pytest.warns(UserWarning, match="Hall-Mammen wild"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + @pytest.mark.parametrize("weight_type", ["rademacher", "mammen", "webb"]) + def test_all_weight_types_under_psu(self, weight_type): + """Rademacher / Mammen / Webb multipliers should all produce + finite SE under the PSU-level path.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille( + n_bootstrap=200, seed=1, bootstrap_weights=weight_type + ).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_multi_horizon_psu_bootstrap(self): + """Event study bootstrap inherits PSU broadcasting.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=300, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + 
assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_multi_horizon_shared_draw_under_coarser_psu(self): + """End-to-end test of the shared-draw multi-horizon bootstrap + under a strictly coarser PSU with L_max >= 2. Exercises the + per-horizon invariant assertion + sup-t band computation + through the Hall-Mammen wild PSU path.""" + df = _make_strictly_coarser_psu_panel(n_periods=7) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # Both horizons should produce finite bootstrap SEs via the + # shared-draw path projected through the PSU map. + for lag in (1, 2): + info = res.event_study_effects[lag] + if info["n_obs"] > 0: + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_sup_t_under_coarser_psu(self): + """Sup-t critical value is computable under coarser PSU.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # sup_t_bands may be None if only one horizon is valid or if + # the cband critical value was not computed. + if res.sup_t_bands is not None: + assert res.sup_t_bands.get("cband_crit_value") is None or ( + res.sup_t_bands["cband_crit_value"] > 1.0 + ) + + +# ── 4. 
Invariants and cross-cutting contracts ────────────────────────
+
+
+class TestInvariants:
+
+    def test_replicate_plus_bootstrap_rejected(self, base_panel, replicate_design):
+        """Replicate variance + n_bootstrap > 0 raises NotImplementedError."""
+        R = 10
+        df = replicate_design(base_panel, R=R, method="BRR")
+        sd = _build_replicate_design(R, "BRR")
+        with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"):
+            ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit(
+                df,
+                outcome="outcome",
+                group="group",
+                time="period",
+                treatment="treatment",
+                survey_design=sd,
+            )
+
+    def test_non_survey_unchanged(self, base_panel):
+        """Non-survey fits (survey_design=None) use the unchanged group-
+        level path: two identically-seeded fits must yield the same SE."""
+        r1 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit(
+            base_panel,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+        )
+        r2 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit(
+            base_panel,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+        )
+        assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10)
+
+    def test_map_for_target_id_lookup(self):
+        """`_map_for_target` builds the PSU map from group IDs via dict
+        lookup, not positional reuse. All current dCDH bootstrap
+        targets use the variance-eligible group ordering, so the
+        helper looks up each target group's PSU via
+        `group_id_to_psu_code`.
Length mismatch → ValueError (loud + failure rather than silent miscluster).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _map_for_target, + ) + + gid_to_psu = {"a": 0, "b": 0, "c": 1, "d": 1, "e": 2} + eligible = np.asarray(["a", "b", "c", "d", "e"]) + expected = np.array([0, 0, 1, 1, 2], dtype=np.int64) + built = _map_for_target(5, gid_to_psu, eligible) + assert built is not None + assert np.array_equal(built, expected) + + # None passthrough when no PSU info + assert _map_for_target(5, None, None) is None + + # Length mismatch → loud failure + with pytest.raises(ValueError, match="target size"): + _map_for_target(3, gid_to_psu, eligible) + + # Missing group ID → loud failure + gid_to_psu_incomplete = {"a": 0, "b": 0, "c": 1, "d": 1} + with pytest.raises(ValueError, match="has no entry"): + _map_for_target(5, gid_to_psu_incomplete, eligible) + + # Non-prefix reordering: different ordered group IDs produce a + # different PSU map even if the dict is the same. This is the + # key invariant the previous prefix-slicing lacked. + eligible_reordered = np.asarray(["c", "a", "d", "e", "b"]) + built_reordered = _map_for_target(5, gid_to_psu, eligible_reordered) + assert built_reordered is not None + expected_reordered = np.array([1, 0, 1, 2, 0], dtype=np.int64) + assert np.array_equal(built_reordered, expected_reordered) + + def test_generate_psu_or_group_weights_broadcast(self): + """Direct unit test of the PSU-level weight generator: + groups mapped to the same PSU receive the same multiplier + within a single bootstrap replicate (Hall-Mammen wild PSU + contract).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + # 6 groups → 3 PSUs: groups (0,1), (2,3), (4,5) share multipliers. 
+        group_to_psu = np.array([0, 0, 1, 1, 2, 2], dtype=np.int64)
+        rng = np.random.default_rng(0)
+        W = _generate_psu_or_group_weights(
+            n_bootstrap=50,
+            n_groups_target=6,
+            weight_type="rademacher",
+            rng=rng,
+            group_to_psu_map=group_to_psu,
+        )
+        assert W.shape == (50, 6)
+        # Groups in the same PSU must receive the same multiplier
+        # within each bootstrap replicate.
+        assert np.array_equal(W[:, 0], W[:, 1])
+        assert np.array_equal(W[:, 2], W[:, 3])
+        assert np.array_equal(W[:, 4], W[:, 5])
+        # Different PSUs should NOT all produce identical columns
+        # (given 50 replicates and Rademacher weights, collision
+        # probability is 2^-50 → effectively 0).
+        assert not np.array_equal(W[:, 0], W[:, 2])
+
+    def test_generate_psu_or_group_weights_identity(self):
+        """Identity map (each group its own PSU) uses the fast path and
+        produces group-level weights bit-identical to the non-PSU path."""
+        from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import (
+            _generate_psu_or_group_weights,
+        )
+
+        identity_map = np.arange(6, dtype=np.int64)
+        rng1 = np.random.default_rng(0)
+        W_identity = _generate_psu_or_group_weights(
+            n_bootstrap=20,
+            n_groups_target=6,
+            weight_type="mammen",
+            rng=rng1,
+            group_to_psu_map=identity_map,
+        )
+        rng2 = np.random.default_rng(0)
+        W_plain = _generate_psu_or_group_weights(
+            n_bootstrap=20,
+            n_groups_target=6,
+            weight_type="mammen",
+            rng=rng2,
+            group_to_psu_map=None,
+        )
+        assert np.array_equal(W_identity, W_plain)
+
+    @pytest.mark.parametrize("method", ["BRR", "JK1"])
+    def test_honest_did_under_replicate(
+        self, base_panel, replicate_design, method
+    ):
+        """HonestDiD bounds should flow through replicate SE and the
+        reduced df_survey (df = min(n_valid) - 1)."""
+        R = 20
+        df = replicate_design(base_panel, R=R, method=method)
+        sd = _build_replicate_design(R, method)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            res = ChaisemartinDHaultfoeuille(seed=1).fit(
+                df,
+                outcome="outcome",
+                group="group",
+
time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + honest_did=True, + ) + assert res.survey_metadata.df_survey == R - 1 + if res.honest_did_results is not None: + # HonestDiD bounds exist and are finite; exact structure is + # honest_did.py internals. Just verify the attribute is + # populated so the SE flow-through is exercised. + assert res.honest_did_results is not None + + def test_rank_deficient_replicate_uses_design_df(self, base_panel): + """Duplicated replicate columns → QR-rank < R → design df < R-1. + + Regression for PR #311 CI review R1 P1. The main dCDH surface, + ``survey_metadata.df_survey``, and HonestDiD must all use the + reduced effective df (``min(design_df, n_valid - 1)``) rather + than the naive ``n_valid - 1`` that ignored the design's QR + rank. Before the fix, ``survey_metadata.df_survey`` stayed at + ``R - 1`` for rank-deficient replicate designs — which is + anti-conservative for t-inference. + """ + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Force duplicate columns to induce rank deficiency: + # rep1 = rep0, rep3 = rep2 → true rank = R - 2. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + # Read off the design-level df via the resolved design so the + # test is robust to internal QR tolerance changes. + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1, ( + f"Expected rank deficiency: design df={design_df} " + f"should be < {R - 1}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, honest_did=True, + ) + # survey_metadata carries the final (reduced) df — not R - 1. 
+ assert res.survey_metadata.df_survey == design_df, ( + f"Expected survey_metadata.df_survey == design_df " + f"({design_df}), got {res.survey_metadata.df_survey}." + ) + # HonestDiD reads results.survey_metadata.df_survey directly + # (honest_did.py:973), so the same reduced df flows through. + # We assert the HonestDiD result is populated (df flow-through + # is exercised by the honest_did=True call above). + assert res.honest_did_results is not None + + def test_dropped_replicate_reduces_df(self, base_panel): + """When some replicate columns produce degenerate per-replicate + estimates (e.g., all-zero weight vectors), the design's QR rank + drops below R — which flows through to the effective df. + + Regression for PR #311 CI review R1 P2. Even when every IF site + reports the full ``n_valid = R``, the persisted df must never + exceed the design-level df. Pre-fix, ``_effective_df_survey`` + returned ``n_valid - 1 = R - 1`` which over-counted degrees of + freedom on a reduced-rank design. + """ + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=4) + # Zero out two replicate columns → drops rank by 2. + df["rep0"] = 0.0 + df["rep1"] = 0.0 + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df <= R - 3, ( + f"Expected zero-column rank deficiency: design df={design_df} " + f"should be <= {R - 3}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # Persisted df must be capped by the design df, not R - 1. + assert res.survey_metadata.df_survey is not None + assert res.survey_metadata.df_survey <= design_df, ( + f"Expected survey_metadata.df_survey <= design_df " + f"({design_df}), got {res.survey_metadata.df_survey}. 
" + f"Effective df must NEVER exceed the design-level df under " + f"replicate variance." + ) + assert res.survey_metadata.df_survey < R - 1, ( + f"Expected persisted df to reflect reduced rank (< R - 1 = " + f"{R - 1}), got {res.survey_metadata.df_survey}." + ) + + def test_rank_1_replicate_forces_nan_inference(self, base_panel): + """All replicate columns identical → QR-rank = 1 → design df + undefined (None). Even if per-IF replicate SEs come back + finite, every public inference field (t_stat / p_value / + conf_int) MUST be NaN — otherwise users get z-based inference + silently instead of the NaN contract documented in REGISTRY.md. + + R2 P0 regression: before the fix, ``_effective_df_survey`` + returned None, ``safe_inference(df=None)`` computed z-based + statistics, and a rank-1 replicate design with finite SEs + would produce valid-looking (but wrong) p-values/CIs. + """ + R = 8 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Make every replicate column identical → QR-rank = 1. + rep_col_shared = df["rep0"].copy() + for r in range(R): + df[f"rep{r}"] = rep_col_shared + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + assert resolved.df_survey is None, ( + "Rank-1 replicate matrix must have undefined design df" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # survey_metadata.df_survey stays None — the honest undefined + # signal for downstream consumers. + assert res.survey_metadata.df_survey is None + # Main dCDH inference fields are NaN via the _inference_df + # coercion (None → 0 under replicate → safe_inference NaN + # branch). + assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + # Event study horizon 1 is also NaN. 
+ if 1 in res.event_study_effects: + info = res.event_study_effects[1] + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + + def test_heterogeneity_replicate_cross_surface_df_consistency( + self, base_panel, + ): + """With ``heterogeneity=`` active under a rank-deficient + replicate design, every public surface — top-level dCDH, + event study, heterogeneity, ``survey_metadata.df_survey``, + and HonestDiD — MUST use the SAME final effective df. + + R2 P1 regression: before the fix, the main surfaces + (``overall_*``, ``event_study_effects``) computed t/p/CI + with an intermediate ``_df_survey`` that did not include + heterogeneity's ``n_valid_het``. Then + ``survey_metadata.df_survey`` was overwritten with the + post-heterogeneity min — so HonestDiD used a different df + than the main dCDH surface. + """ + from scipy import stats as _stats + + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=7) + # Rank deficiency: 2 duplicated pairs → rank = R - 2 = 10. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1 + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + honest_did=True, + ) + final_df = res.survey_metadata.df_survey + # All sites report n_valid == R → effective df = min(design_df, R-1) + # = design_df. + assert final_df == design_df + # Verify overall inference was computed with df=final_df by + # checking p-value against t(final_df) — would disagree if + # df=R-1 or df=None was used. 
+ t_stat = res.overall_att / res.overall_se + expected_p = 2 * _stats.t.sf(abs(t_stat), df=final_df) + assert res.overall_p_value == pytest.approx(expected_p, rel=1e-6) + # Same check for heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]): + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf(abs(het_t), df=final_df) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6 + ) + # HonestDiD bounds were computed from the same survey_metadata; + # asserting the result attribute is populated confirms the df + # flow-through was exercised. + assert res.honest_did_results is not None + + def test_heterogeneity_late_nvalid_propagates_to_all_surfaces( + self, base_panel, monkeypatch, + ): + """Regression for PR #311 CI review R4 P2. + + The duplicated-column test above exercises the design-df cap + (which binds BEFORE heterogeneity runs), so it cannot catch a + bug where the post-heterogeneity recompute block fails to + propagate the final df to every public surface. This test + isolates the late-arriving case by monkeypatching + `compute_replicate_if_variance` to return a SMALLER n_valid + only on the heterogeneity call site — after main surfaces + have already frozen their t/p/CI fields with the larger df. + + Every public surface (`overall_*`, `event_study_effects`, + `placebo_event_study`, `heterogeneity_effects`, + `survey_metadata.df_survey`) must reflect the reduced df. + """ + from scipy import stats as _stats + from diff_diff import survey as _survey_mod + + R = 20 + reduced_n_valid = 7 # small enough that (reduced - 1) < R - 1 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + # Clean (full-rank) design — design df is NOT the binding cap. 
+ assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Pass 1: count the main-surface calls to + # `compute_replicate_if_variance` (L_max=1, no heterogeneity). + main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Pass 2: reduce n_valid on calls STRICTLY AFTER the main + # surfaces (i.e., heterogeneity's call). + tracker = {"count": 0} + + def reduce_after_main(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, reduced_n_valid + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", + reduce_after_main, + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + + # Heterogeneity's smaller n_valid should be the binding final + # df: min(R - 1, reduced_n_valid - 1) = reduced_n_valid - 1. + expected_df = reduced_n_valid - 1 + assert res.survey_metadata.df_survey == expected_df, ( + f"survey_metadata.df_survey should be {expected_df} " + f"(reduced_n_valid - 1), got {res.survey_metadata.df_survey}" + ) + + # Overall surface: p-value must use the reduced df. 
+ t_stat = res.overall_att / res.overall_se + expected_p_overall = 2 * _stats.t.sf(abs(t_stat), df=expected_df) + assert res.overall_p_value == pytest.approx( + expected_p_overall, rel=1e-6, + ), ( + "overall_p_value should reflect heterogeneity's late " + "n_valid via the post-recompute block" + ) + + # Event study (L=1) surface. + if res.event_study_effects and 1 in res.event_study_effects: + info = res.event_study_effects[1] + if np.isfinite(info["se"]) and info["se"] > 0: + t_l = info["effect"] / info["se"] + expected_p_l = 2 * _stats.t.sf(abs(t_l), df=expected_df) + assert info["p_value"] == pytest.approx( + expected_p_l, rel=1e-6, + ) + + # Placebo event study surface (NEGATIVE keys). This is the + # one the copied-value bug would hide: `placebo_event_study` + # holds a VALUE copy of `placebo_horizon_inference`, so a + # broken recompute block would leave stale t/p/CI here. + if res.placebo_event_study: + for _lag_neg, pl_info in res.placebo_event_study.items(): + if np.isfinite(pl_info["se"]) and pl_info["se"] > 0: + t_pl = pl_info["effect"] / pl_info["se"] + expected_p_pl = 2 * _stats.t.sf( + abs(t_pl), df=expected_df, + ) + assert pl_info["p_value"] == pytest.approx( + expected_p_pl, rel=1e-6, + ), ( + f"placebo_event_study[{_lag_neg}] p-value " + f"must reflect the reduced df " + f"({expected_df}); got " + f"{pl_info['p_value']} vs expected " + f"{expected_p_pl}" + ) + + # Heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]) and het_info["se"] > 0: + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf( + abs(het_t), df=expected_df, + ) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6, + ) + + # Normalized effects surface. This is a DERIVED dict built + # pre-heterogeneity (delta-method SE from per-horizon SE) — + # same class of "copy" bug as placebo_event_study. 
R4 P0 + # regression: without the recompute, normalized_effects would + # silently ship p_value/conf_int computed with the + # pre-heterogeneity df. + if res.normalized_effects: + for _lag, norm_info in res.normalized_effects.items(): + if np.isfinite(norm_info["se"]) and norm_info["se"] > 0: + norm_t = norm_info["effect"] / norm_info["se"] + expected_norm_p = 2 * _stats.t.sf( + abs(norm_t), df=expected_df, + ) + assert norm_info["p_value"] == pytest.approx( + expected_norm_p, rel=1e-6, + ), ( + f"normalized_effects[{_lag}] p-value must " + f"reflect the reduced df ({expected_df}); got " + f"{norm_info['p_value']} vs expected " + f"{expected_norm_p}" + ) + + def test_late_nvalid_below_two_forces_all_surfaces_nan( + self, base_panel, monkeypatch, + ): + """Regression for R4 P2 follow-up: when heterogeneity's late + replicate failures drive the final `n_valid` to 1 (reduced df + = 0), every public inference surface must flip to NaN — not + silently keep the finite values that were computed with a + larger intermediate df. + + `_inference_df` coerces df=0 under replicate → safe_inference + NaN branch. Exercises the all-NaN contract end to end. + """ + from diff_diff import survey as _survey_mod + + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Count main-only calls. 
+ main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Return n_valid=1 on heterogeneity's call → reduced df = 0 + # → _inference_df coerces to NaN. + tracker = {"count": 0} + + def reduce_to_one(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, 1 + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", reduce_to_one + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + # Final effective df is None (reduced_df = 0 < 1 guard). + assert res.survey_metadata.df_survey is None + # Every public inference surface must be NaN. 
+ assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + if res.event_study_effects: + for info in res.event_study_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.placebo_event_study: + for info in res.placebo_event_study.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.heterogeneity_effects: + for info in res.heterogeneity_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.normalized_effects: + for info in res.normalized_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"])