From f41d0b67cc8c66e9243e35357a3dfcb5a49a4a47 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:27:01 -0400 Subject: [PATCH 01/11] Add replicate-weight variance and PSU-level bootstrap to dCDH Extends the group-level Taylor Series Linearization survey support landed in PR #307 with two opt-in variance mechanisms: 1. Replicate-weight variance (BRR / Fay / JK1 / JKn / SDR) routed through the unified compute_replicate_if_variance helper at every IF site (overall DID_M, joiners DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l, heterogeneity beta^het_l, twowayfeweights helper). Uses inline-branch pattern mirroring EfficientDiD (efficient_did.py:1119-1142) and TripleDifference (triple_diff.py:1206-1238). Effective df_survey reduces to min(n_valid) - 1 across IF sites when replicates fail, matching the precedent in efficient_did.py:1133-1135 and triple_diff.py:676-686. 2. PSU-level Hall-Mammen wild bootstrap via group_to_psu_map threaded through _compute_dcdh_bootstrap. Under auto-inject psu=group the identity-map fast path preserves pre-change behavior bit-for-bit. Under an explicitly coarser PSU (e.g., psu=state with county-level groups), all groups in the same PSU receive the same bootstrap multiplier so within-PSU correlation is preserved. Locked contracts: - Replicate weights and n_bootstrap > 0 are mutually exclusive (replicate variance is closed-form; bootstrap would double-count). Raises NotImplementedError, matching efficient_did.py:989 etc. - Strata participate only through analytical TSL variance, not the bootstrap (multiplier weights stay unit-by-unit). - Warning fires only when n_psu_eff < n_groups_eff (strictly coarser PSU). Under auto-inject psu=group the warning is correctly suppressed. 
Adds 36 regression tests in tests/test_survey_dcdh_replicate_psu.py covering Class A sites (via _survey_se_from_group_if), Class B sites (heterogeneity + twowayfeweights), PSU bootstrap semantics, and cross-cutting invariants (replicate+bootstrap rejection, HonestDiD under replicate). REGISTRY.md lines 651-653 updated to document both mechanisms. Full suite: 525 passing (was 489 before this change). --- diff_diff/chaisemartin_dhaultfoeuille.py | 310 +++++++-- .../chaisemartin_dhaultfoeuille_bootstrap.py | 118 +++- diff_diff/guides/llms-full.txt | 6 +- docs/methodology/REGISTRY.md | 6 +- tests/test_survey_dcdh.py | 66 +- tests/test_survey_dcdh_replicate_psu.py | 592 ++++++++++++++++++ 6 files changed, 1012 insertions(+), 86 deletions(-) create mode 100644 tests/test_survey_dcdh_replicate_psu.py diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 9e8b9818..9af4b6c0 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -714,13 +714,15 @@ def fit( f"weight_type='pweight', got '{resolved_survey.weight_type}'. " f"The survey IF variance math assumes probability weights." ) - if (resolved_survey.replicate_weights is not None - and resolved_survey.replicate_weights.shape[1] > 0): - raise NotImplementedError( - "Replicate weight variance for dCDH is not yet supported. " - "Use strata/PSU/FPC for design-based inference via Taylor " - "Series Linearization." - ) + # Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are supported + # for analytical variance via compute_replicate_if_variance() + # at each IF site (see _survey_se_from_group_if). The combination + # of replicate weights and n_bootstrap > 0 is rejected inside + # the bootstrap entry block (replicate variance is closed-form; + # bootstrap would double-count variance). Matches library + # precedent: efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. 
+ # Within-group-constant PSU/strata is required: the IF # expansion psi_i = U[g] * (w_i / W_g) supports within-group # variation in WEIGHTS (each obs contributes proportionally), @@ -767,6 +769,16 @@ def fit( "resolved": resolved_survey, } + # Replicate-weight variance tracker: each _compute_se call under + # a replicate design returns n_valid (number of replicate columns + # that produced a finite estimate). The effective df_survey is + # min(n_valid) - 1 across all IF sites — matches the precedent in + # `diff_diff/efficient_did.py:1133-1135` and + # `diff_diff/triple_diff.py:676-686`. Under TSL (analytical), + # _compute_se returns None for n_valid and df_survey falls through + # to resolved_survey.df_survey (= n_psu - n_strata). + _replicate_n_valid_list: List[int] = [] + # ------------------------------------------------------------------ # Step 4b: Covariate aggregation (DID^X, Web Appendix Section 1.2) # ------------------------------------------------------------------ @@ -1589,16 +1601,18 @@ def fit( U_centered_l = _cohort_recenter(U_l_elig, cid_elig) N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] - se_l = _compute_se( + se_l, n_valid_l = _compute_se( U_centered=U_centered_l, divisor=N_l_h, obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_l, ) + if n_valid_l is not None: + _replicate_n_valid_list.append(n_valid_l) multi_horizon_se[l_h] = se_l did_l_val = multi_horizon_dids[l_h]["did_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) multi_horizon_inference[l_h] = { "effect": did_l_val, @@ -1716,15 +1730,17 @@ def fit( cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if 
eligible_mask_pl[g]] - se_pl_l = _compute_se( + se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_pl, ) + if n_valid_pl_l is not None: + _replicate_n_valid_list.append(n_valid_pl_l) placebo_horizon_se[lag_l] = se_pl_l pl_val = pl_data["placebo_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( pl_val, se_pl_l, alpha=self.alpha, df=_df_s ) @@ -1799,12 +1815,14 @@ def fit( ) # Analytical SE for DID_M (survey-aware when survey_design provided) - overall_se = _compute_se( + overall_se, n_valid_overall = _compute_se( U_centered=U_centered_overall, divisor=N_S, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_overall is not None: + _replicate_n_valid_list.append(n_valid_overall) # Detect the degenerate-cohort case: every variance-eligible group # forms its own (D_{g,1}, F_g, S_g) cohort, so the centered # influence function is identically zero and `_plugin_se` returns @@ -1829,21 +1847,22 @@ def fit( UserWarning, stacklevel=2, ) - _df_survey = ( - resolved_survey.df_survey if resolved_survey is not None else None - ) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( overall_att, overall_se, alpha=self.alpha, df=_df_survey ) # Joiners SE (uses joiner-only centered IF; conservative bound) if joiners_available: - joiners_se = _compute_se( + joiners_se, n_valid_joiners = _compute_se( U_centered=U_centered_joiners, divisor=joiner_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_joiners is not None: + _replicate_n_valid_list.append(n_valid_joiners) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( 
joiners_att, joiners_se, alpha=self.alpha, df=_df_survey ) @@ -1857,12 +1876,15 @@ def fit( # Leavers SE if leavers_available: - leavers_se = _compute_se( + leavers_se, n_valid_leavers = _compute_se( U_centered=U_centered_leavers, divisor=leaver_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_leavers is not None: + _replicate_n_valid_list.append(n_valid_leavers) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( leavers_att, leavers_se, alpha=self.alpha, df=_df_survey ) @@ -1905,14 +1927,64 @@ def fit( bootstrap_results: Optional[DCDHBootstrapResults] = None if self.n_bootstrap > 0: if resolved_survey is not None: - warnings.warn( - "Bootstrap with survey_design uses group-level multiplier " - "weights, not PSU-level. This is conservative when groups " - "are finer than PSUs. PSU-level survey bootstrap is " - "deferred to a future release.", - UserWarning, - stacklevel=2, - ) + # Replicate-weight designs have their own closed-form + # variance (Rao-Wu / BRR / Fay / JK1 / JKn / SDR via + # compute_replicate_if_variance). Combining replicate + # variance with a multiplier bootstrap would double-count + # variance. Match library precedent: + # efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + if ( + resolved_survey.replicate_weights is not None + and resolved_survey.replicate_weights.shape[1] > 0 + ): + raise NotImplementedError( + "dCDH survey support rejects the combination of " + "replicate weights and n_bootstrap > 0. Replicate-" + "weight variance is closed-form " + "(compute_replicate_if_variance); use n_bootstrap=0. " + "For a bootstrap-based SE, use strata/PSU/FPC design " + "instead of replicate weights." + ) + + # Warning fires only when PSU is strictly coarser than + # group. 
Under auto-inject psu=group (or explicit + # psu=<group>), groups and PSUs coincide so + # Hall-Mammen wild PSU bootstrap equals group-level + # multiplier bootstrap — no need to warn. + psu_arr_warn = getattr(resolved_survey, "psu", None) + if psu_arr_warn is None or _obs_survey_info is None: + # No PSU info — can't compare to group count. + n_psu_eff_warn, n_groups_eff_warn = -1, -1 + else: + pos_mask_warn = ( + np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + > 0 + ) + psu_eff = np.asarray(psu_arr_warn)[pos_mask_warn] + gids_eff_warn = np.asarray( + _obs_survey_info["group_ids"] + )[pos_mask_warn] + n_psu_eff_warn = int(len(np.unique(psu_eff))) + n_groups_eff_warn = int(len(np.unique(gids_eff_warn))) + if 0 <= n_psu_eff_warn < n_groups_eff_warn: + warnings.warn( + f"Bootstrap with survey_design uses Hall-Mammen " + f"wild multiplier weights at the PSU level " + f"(n_psu={n_psu_eff_warn} PSUs across " + f"n_groups={n_groups_eff_warn} groups). For " + f"designs with substantially unequal PSU sizes, " + f"the wild bootstrap may under-cover relative to " + f"analytical TSL inference; consider " + f"n_bootstrap=0 for the TSL variance. If n_psu " + f"is close to n_groups, lonely-PSU removal " + f"(lonely_psu='remove') may be collapsing " + f"singletons.", + UserWarning, + stacklevel=2, + ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att) if joiners_available else None ) @@ -2028,6 +2100,48 @@ def fit( h_data["did_l"], ) + # Under a survey design with PSU information, build a dense + # group_to_psu_map so the bootstrap randomizes at the PSU + # level rather than at the group level (Hall-Mammen wild PSU + # bootstrap). PSU/strata are within-group-constant by the + # _validate_group_constant_strata_psu precondition, so each + # variance-eligible group has exactly one PSU label. Under + # auto-inject psu=group the map is the identity and the + # bootstrap mixin's fast path reproduces the pre-PSU + # behavior bit-for-bit. 
+ group_to_psu_map_bootstrap: Optional[np.ndarray] = None + if ( + resolved_survey is not None + and getattr(resolved_survey, "psu", None) is not None + and _obs_survey_info is not None + ): + obs_psu_codes = np.asarray(resolved_survey.psu) + obs_gids_boot = np.asarray(_obs_survey_info["group_ids"]) + obs_weights_boot = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_boot = obs_weights_boot > 0 + group_psu_labels: List[Any] = [] + valid_map = True + for gid in _eligible_group_ids: + mask_g = (obs_gids_boot == gid) & pos_mask_boot + if not mask_g.any(): + valid_map = False + break + labels = obs_psu_codes[mask_g] + # Within-group-constant PSU is validated upstream; + # .item() on the first is robust to integer vs string. + group_psu_labels.append(labels[0]) + if valid_map and len(group_psu_labels) == n_groups_for_overall_var: + # Dense integer indices for _generate_psu_or_group_weights. + _, group_to_psu_map_bootstrap = np.unique( + np.asarray(group_psu_labels), + return_inverse=True, + ) + group_to_psu_map_bootstrap = np.asarray( + group_to_psu_map_bootstrap, dtype=np.int64 + ) + br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, u_centered_overall=U_centered_overall, @@ -2038,6 +2152,7 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, + group_to_psu_map=group_to_psu_map_bootstrap, ) bootstrap_results = br @@ -2496,6 +2611,7 @@ def fit( rank_deficient_action=self.rank_deficient_action, group_ids_order=np.array(all_groups), obs_survey_info=_obs_survey_info, + replicate_n_valid_list=_replicate_n_valid_list, ) twfe_weights_df = None @@ -3278,6 +3394,7 @@ def _compute_heterogeneity_test( rank_deficient_action: str = "warn", group_ids_order: Optional[np.ndarray] = None, obs_survey_info: Optional[Dict[str, Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Dict[int, Dict[str, Any]]: """Test for heterogeneous treatment 
effects (Web Appendix Section 1.5). @@ -3323,14 +3440,25 @@ def _compute_heterogeneity_test( obs_survey_info is not None and group_ids_order is not None ) if use_survey: - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - df_s = ( - resolved.df_survey if resolved is not None else None - ) + # df_s starts from the design-level df (n_psu - n_strata under TSL, + # R - 1 under replicate). It may be reduced further by replicate + # failures reported via replicate_n_valid_list (this function + # appends its own n_valid observations). + if ( + replicate_n_valid_list is not None + and len(replicate_n_valid_list) > 0 + ): + df_s = int(min(replicate_n_valid_list)) - 1 + else: + df_s = resolved.df_survey if resolved is not None else None # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, # Step 6 multi-switch) appear in obs_gids_raw but must be @@ -3484,15 +3612,33 @@ def _compute_heterogeneity_test( obs_w_raw[mask_g] / w_sum_g ) - # Binder TSL variance across stratified PSUs. - var_s = compute_survey_if_variance(psi_obs, resolved) + # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR) + # vs Binder TSL across stratified PSUs. Mirrors the inline + # branch in _survey_se_from_group_if and the pattern in + # TripleDifference:1206-1238. Heterogeneity uses WLS with + # full-sample weights; theta_hat is treated as fixed per the + # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity + # under replicate — no per-replicate refits). 
+ if getattr(resolved, "uses_replicate_variance", False): + var_s, n_valid_het = compute_replicate_if_variance( + psi_obs, resolved + ) + if replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_het) + # Reduce df_s to reflect this horizon's n_valid. + df_s_local = int(n_valid_het) - 1 if df_s is None else min( + df_s, int(n_valid_het) - 1 + ) + else: + var_s = compute_survey_if_variance(psi_obs, resolved) + df_s_local = df_s se_het = ( float(np.sqrt(var_s)) if np.isfinite(var_s) and var_s > 0 else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s + beta_het, se_het, alpha=alpha, df=df_s_local ) results[l_h] = { @@ -4812,24 +4958,51 @@ def _validate_group_constant_strata_psu( ) +def _effective_df_survey( + resolved_survey: Any, + replicate_n_valid_list: List[int], +) -> Optional[int]: + """Compute the effective ``df_survey`` for t-critical values. + + Under replicate-weight designs, each IF site returns an ``n_valid`` + count (number of replicate columns that produced a finite estimate). + This helper returns ``min(n_valid) - 1`` across sites, matching the + precedent in ``diff_diff/efficient_did.py:1133-1135`` and + ``diff_diff/triple_diff.py:676-686``. Under TSL (analytical) or + non-survey fits, ``replicate_n_valid_list`` is empty and this falls + through to ``resolved_survey.df_survey`` (or ``None`` when there's + no survey design at all). + """ + if resolved_survey is None: + return None + if replicate_n_valid_list: + return int(min(replicate_n_valid_list)) - 1 + return resolved_survey.df_survey + + def _compute_se( U_centered: np.ndarray, divisor: int, obs_survey_info: Optional[dict], eligible_groups: Optional[list] = None, -) -> float: +) -> Tuple[float, Optional[int]]: """Dispatch to plug-in SE or survey-design-aware SE. When ``obs_survey_info`` is ``None``, falls back to the simple plug-in formula. Otherwise, expands group-level IFs and delegates - to TSL variance. 
+ to TSL variance (or replicate-weight variance when the resolved + design carries replicate weights). + + Returns ``(se, n_valid_replicates)``. ``n_valid_replicates`` is + ``None`` under the plug-in / TSL paths, and the number of valid + replicate columns under the replicate path. """ if obs_survey_info is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if eligible_groups is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if divisor <= 0: - return float("nan") + return float("nan"), None # dCDH IFs are numerator-scale (U.sum() == N_S * DID_M). # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. @@ -4845,16 +5018,20 @@ def _survey_se_from_group_if( U_centered: np.ndarray, eligible_groups: list, obs_survey_info: dict, -) -> float: +) -> Tuple[float, Optional[int]]: """Compute survey-design-aware SE from group-level centered IFs. Expands group-level influence function values to observation level using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` - so that ``sum(psi_i for i in g) = U[g]``, then delegates to - ``compute_survey_if_variance()`` for the TSL design-based variance. + so that ``sum(psi_i for i in g) = U[g]``, then delegates to either + ``compute_survey_if_variance()`` (TSL / analytical path) or + ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) depending + on whether the resolved design carries replicate weights. This is a library extension not in the dCDH papers (the paper's - plug-in variance assumes iid sampling). + plug-in variance assumes iid sampling). The replicate path uses + Rao-Wu weight-ratio rescaling; see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey expansion. 
Parameters ---------- @@ -4869,10 +5046,18 @@ def _survey_se_from_group_if( Returns ------- - float - Survey-design-aware SE, or NaN if degenerate. + Tuple[float, Optional[int]] + ``(se, n_valid_replicates)``. ``se`` is the survey-design-aware + SE, or NaN if degenerate. ``n_valid_replicates`` is the number + of valid replicate columns used for variance under the replicate + path, or ``None`` under the TSL (analytical) path. Callers + track the min across sites to set an effective ``df_survey`` + for replicate designs. """ - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) # Degenerate-cohort contract (mirror _plugin_se): when the centered # IF is empty or every cohort is a singleton (→ recentered IF is @@ -4882,9 +5067,9 @@ def _survey_se_from_group_if( # through _compute_se (overall, joiners/leavers, multi-horizon # ATT, placebos, normalized/cumulated, heterogeneity). if U_centered.size == 0: - return float("nan") + return float("nan"), None if float((U_centered ** 2).sum()) <= 0: - return float("nan") + return float("nan"), None group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] @@ -4902,7 +5087,7 @@ def _survey_se_from_group_if( psi = np.zeros(n_obs, dtype=np.float64) if not pos_mask.any(): - return float("nan") + return float("nan"), None gids_eff = np.asarray(group_ids)[pos_mask] w_eff = weights_arr[pos_mask] @@ -4922,10 +5107,20 @@ def _survey_se_from_group_if( safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) psi[pos_mask] = u_obs_eff * (w_eff / safe_w) + # Dispatch: replicate variance (BRR/Fay/JK1/JKn/SDR) vs TSL. + # Mirrors the inline branch in TripleDifference:1206-1238 and + # EfficientDiD:1119-1142. Under replicate, returns (variance, n_valid); + # the caller propagates min(n_valid) to df_survey. 
+ if getattr(resolved, "uses_replicate_variance", False): + variance, n_valid = compute_replicate_if_variance(psi, resolved) + if not np.isfinite(variance) or variance < 0: + return float("nan"), n_valid + return float(np.sqrt(variance)), n_valid + variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: - return float("nan") - return float(np.sqrt(variance)) + return float("nan"), None + return float(np.sqrt(variance)), None def _build_group_time_design( @@ -5240,16 +5435,13 @@ def twowayfeweights( f"The TWFE diagnostic under survey uses survey-weighted cell " f"means; other weight types are not supported." ) - if ( - resolved is not None - and resolved.replicate_weights is not None - and resolved.replicate_weights.shape[1] > 0 - ): - raise NotImplementedError( - "Replicate weight variance for twowayfeweights() is not " - "supported. Use strata/PSU/FPC for design-based inference " - "via Taylor Series Linearization (matches the fit() path)." - ) + # Replicate-weight designs are accepted. TWFE diagnostic has no + # SE field on TWFEWeightsResult — the replicate weights only + # participate through the full-sample weight (resolved.weights), + # which drives survey-weighted cell aggregation in + # _validate_and_aggregate_to_cells. Diagnostic numbers + # (beta_fe, sigma_fe, fraction_negative) match + # fit(..., survey_design=sd).twfe_* under replicate input. 
# Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 7eea75cf..06eec38a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -70,6 +70,8 @@ def _compute_dcdh_bootstrap( # --- Phase 2: multi-horizon inputs --- multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, + # --- Survey: PSU-level bootstrap under survey designs --- + group_to_psu_map: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. @@ -175,6 +177,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, + group_to_psu_map=group_to_psu_map, ) else: overall_se = np.nan @@ -206,6 +209,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_j.size), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -225,6 +229,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_l.size), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -244,6 +249,7 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_pl.size), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -259,13 +265,18 @@ def _compute_dcdh_bootstrap( es_pvals: Dict[int, float] = {} es_dists: Dict[int, np.ndarray] = {} - # Shared weight matrix sized for the group set + # Shared weight matrix sized for the group set. 
Under PSU-level + # bootstrap (Hall-Mammen wild PSU), weights are drawn once per + # PSU and broadcast to groups so all groups in the same PSU + # share a multiplier within a single bootstrap replicate — + # preserving the sup-t joint distribution across horizons. n_groups_mh = n_groups_for_overall - shared_weights = _generate_bootstrap_weights_batch( + shared_weights = _generate_psu_or_group_weights( n_bootstrap=self.n_bootstrap, - n_units=n_groups_mh, + n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, + group_to_psu_map=group_to_psu_map, ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): @@ -324,6 +335,7 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, + group_to_psu_map=_slice_psu_map(group_to_psu_map, u_h.size), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -341,6 +353,93 @@ def _compute_dcdh_bootstrap( # ============================================================================= +def _slice_psu_map( + group_to_psu_map: Optional[np.ndarray], + target_size: int, +) -> Optional[np.ndarray]: + """Slice a full-set group-to-PSU map to a target's contributing group count. + + Multi-horizon / joiner / leaver targets share the overall variance- + eligible group ordering but may have fewer contributing groups + (``u_centered.shape[0] <= n_groups_for_overall``). Slice the first + ``target_size`` entries — this mirrors the existing shared-weights + truncation at the multi-horizon bootstrap site. Returns ``None`` + when no map is provided (plain multiplier-bootstrap path). 
+ """ + if group_to_psu_map is None: + return None + if target_size <= 0 or target_size > len(group_to_psu_map): + return group_to_psu_map + return group_to_psu_map[:target_size] + + +def _generate_psu_or_group_weights( + n_bootstrap: int, + n_groups_target: int, + weight_type: str, + rng: np.random.Generator, + group_to_psu_map: Optional[np.ndarray], +) -> np.ndarray: + """Generate a group-level weight matrix, possibly via PSU broadcasting. + + When ``group_to_psu_map`` is ``None`` or is the identity (each group + is its own PSU), generates weights at the group level directly — + bit-identical to the pre-PSU-bootstrap contract. + + When ``group_to_psu_map`` has fewer unique values than + ``n_groups_target`` (strictly coarser PSU than group), generates + weights at the PSU level and broadcasts to groups via the map. + This is the Hall-Mammen wild PSU bootstrap. + + Parameters + ---------- + n_bootstrap, weight_type, rng + Passed through to generate_bootstrap_weights_batch. + n_groups_target : int + Number of groups contributing to the target's IF vector. + group_to_psu_map : np.ndarray or None + Dense integer PSU indices of shape ``(n_groups_target,)``. + ``None`` triggers the group-level path. + + Returns + ------- + np.ndarray + Shape ``(n_bootstrap, n_groups_target)`` multiplier weights. + """ + if group_to_psu_map is None: + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + if len(group_to_psu_map) != n_groups_target: + raise ValueError( + f"group_to_psu_map length ({len(group_to_psu_map)}) does not " + f"match n_groups_target ({n_groups_target})." + ) + n_psu = int(np.max(group_to_psu_map)) + 1 if group_to_psu_map.size > 0 else 0 + if n_psu >= n_groups_target: + # Identity (each group its own PSU) — skip the broadcast for a + # bit-identical fast path matching the pre-PSU-bootstrap behavior. 
+ return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + # Hall-Mammen wild PSU bootstrap: draw n_psu multipliers, broadcast + # via the dense index map so all groups in the same PSU share a + # multiplier. Preserves clustered sampling structure. + psu_weights = _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_psu, + weight_type=weight_type, + rng=rng, + ) + return psu_weights[:, group_to_psu_map] + + def _bootstrap_one_target( u_centered: np.ndarray, divisor: int, @@ -351,6 +450,7 @@ def _bootstrap_one_target( rng: np.random.Generator, context: str, return_distribution: bool, + group_to_psu_map: Optional[np.ndarray] = None, ) -> Tuple[float, Tuple[float, float], float, Optional[np.ndarray]]: """ Run the multiplier bootstrap for a single dCDH target. @@ -367,16 +467,24 @@ def _bootstrap_one_target( sample mean of the bootstrap distribution should be approximately zero, not the original effect). The original effect is passed separately as the centering point for the percentile p-value. + + When ``group_to_psu_map`` is provided (length ``len(u_centered)``, + dense integer PSU indices), multiplier weights are generated at the + PSU level and broadcast to groups so all groups in the same PSU + receive the same bootstrap multiplier. This is the Hall-Mammen wild + PSU bootstrap; it reduces to the group-level bootstrap when each + group is its own PSU (identity map). 
""" n_groups_target = u_centered.shape[0] if n_groups_target == 0 or divisor == 0: return np.nan, (np.nan, np.nan), np.nan, None - weight_matrix = _generate_bootstrap_weights_batch( + weight_matrix = _generate_psu_or_group_weights( n_bootstrap=n_bootstrap, - n_units=n_groups_target, + n_groups_target=n_groups_target, weight_type=weight_type, rng=rng, + group_to_psu_map=group_to_psu_map, ) # Each bootstrap replicate: (1 / divisor) * sum_g w_b[g] * u_centered[g] diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7e6ef60b..5efb8f13 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -266,7 +266,7 @@ est.fit( trends_nonparam: Any | None = None, # Phase 3: DID^s honest_did: bool = False, # Phase 3: HonestDiD integration # ---- survey support ---- - survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) + survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) OR BRR/Fay/JK1/JKn/SDR replicate weights; n_bootstrap > 0 uses Hall-Mammen PSU multiplier ) -> ChaisemartinDHaultfoeuilleResults ``` @@ -1672,8 +1672,8 @@ sd_female, data_female = sd.subpopulation(data, mask=lambda df: df['sex'] == 'F' **Key features:** - Taylor Series Linearization (TSL) variance with strata + PSU + FPC -- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (12 of 16 estimators) -- Survey-aware bootstrap: multiplier at PSU or Rao-Wu rescaled +- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (13 of 16 estimators, including dCDH) +- Survey-aware bootstrap: multiplier at PSU (Hall-Mammen wild; dCDH, staggered) or Rao-Wu rescaled (SyntheticDiD, SunAbraham) - DEFF diagnostics, subpopulation analysis, weight trimming (`trim_weights`) - Repeated cross-sections: `CallawaySantAnna(panel=False)` - Compatibility matrix: see `docs/choosing_estimator.rst` Survey Design Support section diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 0bf3c5de..60444f9e 100644 --- 
a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -648,9 +648,9 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface -- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). -- **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. 
This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=group` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula.
All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(n_valid_across_sites) - 1` propagates to t-critical values. 
--- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c2c7c0f0..1aa9f14c 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -317,8 +317,16 @@ def test_varying_weights_change_att(self, base_data): result_survey.overall_att, abs=0.01 ) - def test_rejects_replicate_weights(self, base_data): - """Replicate weight variance not yet supported.""" + def test_rejects_replicate_weights_with_bootstrap(self, base_data): + """Replicate weights combined with n_bootstrap > 0 is rejected. + + Replicate variance is closed-form (compute_replicate_if_variance); + combining it with a multiplier bootstrap would double-count + variance. Matches library precedent in efficient_did.py:989, + staggered.py:1869, two_stage.py:251-253. The standalone + replicate-only path (n_bootstrap=0) is supported separately; + see tests/test_survey_dcdh_replicate_psu.py. + """ df = base_data.copy() df["pw"] = 1.0 df["rep1"] = 1.0 @@ -328,8 +336,8 @@ def test_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - ChaisemartinDHaultfoeuille().fit( + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( df, outcome="outcome", group="group", @@ -366,9 +374,31 @@ def test_multi_horizon_survey_runs(self, data_with_survey): class TestBootstrapSurveyWarning: - def test_bootstrap_survey_emits_warning(self, data_with_survey): + def test_bootstrap_survey_auto_inject_no_warning(self, data_with_survey): + """Under auto-inject psu=group, Hall-Mammen wild PSU bootstrap + coincides with the group-level multiplier bootstrap — so no + warning should fire. The old 'PSU-level deferred' warning has + been replaced with a conditional one that only fires when the + user passes a strictly coarser PSU. 
+ + See the new test file tests/test_survey_dcdh_replicate_psu.py for + the strictly-coarser-PSU case where the warning DOES fire. + """ sd = SurveyDesign(weights="pw") - with pytest.warns(UserWarning, match="group-level multiplier"): + import warnings as _w + + with _w.catch_warnings(): + _w.simplefilter("error", UserWarning) + # Ignore warnings unrelated to the bootstrap-PSU contract. + _w.filterwarnings( + "ignore", message="Single-period placebo SE" + ) + _w.filterwarnings( + "ignore", message="pweight weights normalized" + ) + _w.filterwarnings( + "ignore", message="Assumption 11" + ) ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( data_with_survey, outcome="outcome", @@ -728,9 +758,11 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): survey_design=sd, ) - def test_twfe_helper_rejects_replicate_weights(self, base_data): - """Replicate-weight survey designs must be rejected by the helper, - matching fit()'s NotImplementedError contract.""" + def test_twfe_helper_accepts_replicate_weights(self, base_data): + """Replicate-weight designs are accepted by the helper (no SE field + on TWFEWeightsResult, so only cell aggregation is affected). 
Matches + fit()'s new contract where replicate variance runs analytically via + compute_replicate_if_variance.""" from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights df_ = base_data.copy() @@ -742,13 +774,15 @@ def test_twfe_helper_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - twowayfeweights( - df_, - outcome="outcome", group="group", - time="period", treatment="treatment", - survey_design=sd, - ) + result = twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.beta_fe) + assert np.isfinite(result.sigma_fe) + assert 0.0 <= result.fraction_negative <= 1.0 # ── Test: TWFE diagnostic oracle under survey ─────────────────────── diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py new file mode 100644 index 00000000..aa1be454 --- /dev/null +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -0,0 +1,592 @@ +"""Replicate-weight variance and PSU-level bootstrap tests for dCDH. + +Covers the two survey-variance extensions added after PR #307: +1. Replicate-weight variance (BRR/Fay/JK1/JKn/SDR) via the unified + `compute_replicate_if_variance` helper, routed through the inline + branch in `_survey_se_from_group_if` (Class A sites) and the + parallel inline branch in `_compute_heterogeneity_test` (Class B). +2. PSU-level Hall-Mammen wild bootstrap via `group_to_psu_map` + threaded through `_compute_dcdh_bootstrap`, with an identity-map + fast path preserving auto-inject `psu=group` semantics. 
+""" + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign +from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + +# ── Fixtures ──────────────────────────────────────────────────────── + + +REPLICATE_METHODS = ["BRR", "Fay", "JK1", "JKn", "SDR"] + + +def _make_reversible_panel( + n_groups: int = 30, + n_periods: int = 5, + seed: int = 42, +) -> pd.DataFrame: + """Simple reversible-treatment panel with two switch cohorts.""" + rng = np.random.default_rng(seed) + rows = [] + for g in range(n_groups): + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "x_het": float(g) * 0.1, + } + ) + return pd.DataFrame(rows) + + +def _attach_replicate_weights( + df: pd.DataFrame, + R: int, + method: str, + seed: int = 101, +) -> pd.DataFrame: + """Add R replicate-weight columns and return the mutated DataFrame.""" + rng = np.random.default_rng(seed) + df = df.copy() + for r in range(R): + if method == "BRR" or method == "Fay" or method == "SDR": + df[f"rep{r}"] = rng.choice([0.5, 1.5], size=len(df)) + elif method in ("JK1", "JKn"): + df[f"rep{r}"] = rng.uniform(0.5, 1.5, size=len(df)) + return df + + +def _rep_cols(R: int) -> list: + return [f"rep{r}" for r in range(R)] + + +def _build_replicate_design(R: int, method: str) -> SurveyDesign: + """Build a SurveyDesign object with method-specific required params.""" + if method == "Fay": + return SurveyDesign( + weights="pw", + replicate_weights=_rep_cols(R), + replicate_method="Fay", + fay_rho=0.5, + ) + if method == "JKn": + return SurveyDesign( + weights="pw", + replicate_weights=_rep_cols(R), + replicate_method="JKn", + replicate_strata=[r % 2 for r in range(R)], + ) + return SurveyDesign( + weights="pw", + 
replicate_weights=_rep_cols(R), + replicate_method=method, + ) + + +@pytest.fixture(scope="module") +def base_panel(): + return _make_reversible_panel(n_groups=30, n_periods=5, seed=42) + + +@pytest.fixture +def replicate_design(): + """Helper constructing a SurveyDesign with replicate weights for a given method.""" + + def _build(df: pd.DataFrame, R: int, method: str, seed: int = 101) -> pd.DataFrame: + return _attach_replicate_weights(df, R=R, method=method, seed=seed) + + return _build + + +def _make_strictly_coarser_psu_panel( + n_groups_per_psu: int = 3, + n_psu: int = 4, + n_periods: int = 5, + seed: int = 17, +) -> pd.DataFrame: + """Panel where groups nest strictly inside PSUs. + + 12 groups across 4 PSUs (3 groups/PSU). Two switch cohorts so the + estimator has both joiners at different F_g. + """ + rng = np.random.default_rng(seed) + n_groups = n_groups_per_psu * n_psu + rows = [] + for g in range(n_groups): + psu = g // n_groups_per_psu + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "psu": psu, + } + ) + return pd.DataFrame(rows) + + +# ── 1. Class A replicate (sites via _survey_se_from_group_if) ──────── + + +class TestReplicateClassA: + """Replicate variance wired through the inline branch in + `_survey_se_from_group_if` — inherited by overall DID_M, joiners + DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l. 
+ """ + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_overall_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) + assert res.overall_se > 0 + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_inference_fields_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_t_stat) + assert np.isfinite(res.overall_p_value) + assert np.all(np.isfinite(res.overall_conf_int)) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_df_survey_reflects_n_valid(self, base_panel, replicate_design, method): + R = 24 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # All replicates should succeed on a clean panel → df = n_valid - 1 = R - 1 + assert res.survey_metadata.df_survey == R - 1 + + def test_jk1_converges_to_tsl(self, replicate_design): + """JK1 replicate SE should be in the same order of magnitude as + analytical TSL SE on a large panel. 
Uses a loose 50% tolerance + because convergence depends on weight-construction details; the + stringent R-parity convergence test lives in + `test_survey_phase6.py`.""" + df = _make_reversible_panel(n_groups=60, n_periods=6, seed=42) + R = 30 + df_rep = replicate_design(df, R=R, method="JK1") + + sd_rep = _build_replicate_design(R, "JK1") + sd_tsl = SurveyDesign(weights="pw") + + res_rep = ChaisemartinDHaultfoeuille(seed=1).fit( + df_rep, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_tsl = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_tsl, + ) + # Same point estimate (both unweighted) + assert res_rep.overall_att == pytest.approx(res_tsl.overall_att, rel=1e-6) + # SEs should be within 50% of each other (broad envelope) + ratio = res_rep.overall_se / res_tsl.overall_se + assert 0.5 <= ratio <= 2.0, ( + f"Replicate SE ({res_rep.overall_se:.4f}) and TSL SE " + f"({res_tsl.overall_se:.4f}) differ by more than 2x." 
+ ) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_multi_horizon_under_replicate( + self, replicate_design, method + ): + """Multi-horizon DID_l inherits the Class A dispatch — horizon 1 + (always identifiable on this panel) should produce a finite + replicate-based SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_placebo_under_replicate(self, replicate_design): + """Placebo DID^{pl}_l under replicate weights also carries finite SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + if res.placebo_event_study: + info = res.placebo_event_study.get(1) + if info is not None and info.get("n_obs", 0) > 0: + assert np.isfinite(info["se"]) + + def test_did_x_replicate(self, base_panel, replicate_design): + """DID^X covariate adjustment flows through the Class A dispatch + (Y_mat residualization happens upstream of IF).""" + df = base_panel.copy() + rng = np.random.default_rng(123) + df["cov1"] = rng.normal(size=len(df)) + R = 20 + df = replicate_design(df, R=R, method="JK1") + sd = _build_replicate_design(R, "JK1") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + controls=["cov1"], + L_max=1, + ) + assert np.isfinite(res.overall_att) + assert 
np.isfinite(res.overall_se) and res.overall_se > 0 + + +# ── 2. Class B replicate (heterogeneity + twowayfeweights) ────────── + + +class TestReplicateClassB: + """Sites that call `compute_survey_if_variance` directly (or don't + route through the variance path at all) and need their own + replicate handling.""" + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_heterogeneity_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + heterogeneity="x_het", + ) + assert res.heterogeneity_effects is not None + info = res.heterogeneity_effects[1] + assert np.isfinite(info["beta"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_twowayfeweights_accepts_replicate(self, base_panel, replicate_design): + """TWFE diagnostic produces the same beta_fe / sigma_fe / + fraction_negative under a replicate design as under a + pweight-only design (replicate weights only affect + `resolved.weights` cell aggregation).""" + R = 20 + df = replicate_design(base_panel, R=R, method="BRR") + sd_rep = _build_replicate_design(R, "BRR") + sd_plain = SurveyDesign(weights="pw") + + res_rep = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_plain = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_plain, + ) + # Cell aggregation uses resolved.weights; replicate_weights are + # present but don't change the aggregated diagnostics. 
+ assert res_rep.beta_fe == pytest.approx(res_plain.beta_fe, rel=1e-10) + assert res_rep.sigma_fe == pytest.approx(res_plain.sigma_fe, rel=1e-10) + assert res_rep.fraction_negative == pytest.approx( + res_plain.fraction_negative, rel=1e-10 + ) + + +# ── 3. PSU-level Hall-Mammen wild bootstrap ───────────────────────── + + +class TestPSUBootstrap: + + def test_auto_inject_bit_identical_to_group_level(self, base_panel): + """Under auto-inject psu=group the Hall-Mammen PSU bootstrap + collapses to the group-level multiplier bootstrap via the + identity-map fast path — producing bit-identical SE.""" + df = base_panel.copy() + sd_auto = SurveyDesign(weights="pw") + sd_explicit = SurveyDesign(weights="pw", psu="group") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + r_auto = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_auto, + ) + r_explicit = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_explicit, + ) + assert r_auto.overall_se == pytest.approx(r_explicit.overall_se, rel=1e-10) + + def test_coarser_psu_produces_finite_se(self): + """Under a strictly coarser PSU, the Hall-Mammen PSU bootstrap + still produces a finite SE (the value depends on within-PSU + IF correlation).""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_no_warning_under_auto_inject(self, base_panel): + """No UserWarning about PSU-level bootstrap should fire 
under + auto-inject psu=group (the warning only fires for strictly + coarser PSU).""" + sd = SurveyDesign(weights="pw") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert not any( + "Hall-Mammen wild" in str(w.message) + or "group-level multiplier" in str(w.message) + for w in caught + ), ( + "Bootstrap-PSU warning should not fire under auto-inject psu=group, " + f"but got: {[str(w.message) for w in caught]}" + ) + + def test_warning_under_coarser_psu(self): + """When PSU is strictly coarser than group, the Hall-Mammen + warning fires.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with pytest.warns(UserWarning, match="Hall-Mammen wild"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + @pytest.mark.parametrize("weight_type", ["rademacher", "mammen", "webb"]) + def test_all_weight_types_under_psu(self, weight_type): + """Rademacher / Mammen / Webb multipliers should all produce + finite SE under the PSU-level path.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille( + n_bootstrap=200, seed=1, bootstrap_weights=weight_type + ).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_multi_horizon_psu_bootstrap(self): + """Event study bootstrap inherits PSU broadcasting.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + 
warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=300, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_sup_t_under_coarser_psu(self): + """Sup-t critical value is computable under coarser PSU.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # sup_t_bands may be None if only one horizon is valid or if + # the cband critical value was not computed. + if res.sup_t_bands is not None: + assert res.sup_t_bands.get("cband_crit_value") is None or ( + res.sup_t_bands["cband_crit_value"] > 1.0 + ) + + +# ── 4. 
Invariants and cross-cutting contracts ──────────────────────── + + +class TestInvariants: + + def test_replicate_plus_bootstrap_rejected(self, base_panel, replicate_design): + """Replicate variance + n_bootstrap > 0 raises NotImplementedError.""" + R = 10 + df = replicate_design(base_panel, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + def test_non_survey_unchanged(self, base_panel): + """Non-survey fits (survey_design=None) should produce bit- + identical SE to the pre-PSU-bootstrap behavior.""" + r1 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + r2 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + ) + assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_honest_did_under_replicate( + self, base_panel, replicate_design, method + ): + """HonestDiD bounds should flow through replicate SE and the + reduced df_survey (df = min(n_valid) - 1).""" + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + honest_did=True, + ) + assert res.survey_metadata.df_survey == R - 1 + if res.honest_did_results is not None: + # HonestDiD bounds exist and are finite; exact structure is + # honest_did.py internals. 
Just verify the attribute is + # populated so the SE flow-through is exercised. + assert res.honest_did_results is not None From 53dc755e6b0d1ae63f03fe1d90597b5705ee16f4 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:38:12 -0400 Subject: [PATCH 02/11] Harden PSU-map propagation + update fit() survey_design docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses local AI review findings: 1. P1: `_slice_psu_map` now enforces strict length equality between `target_size` and the full PSU map. Under the current bootstrap architecture all targets (overall DID_M, joiners DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l) use the same variance-eligible group ordering from `_eligible_group_ids`, so the old prefix-slicing was a silent no-op. Making the check strict converts a fragile unstated invariant into a loud `ValueError` if a future refactor introduces a target whose group subset differs from the overall ordering (e.g., an aggregated target built from a non-prefix mask) — preventing a silent miscluster bug. 2. P2: `fit()` `survey_design` docstring updated to describe: - Replicate-weight variance support (BRR/Fay/JK1/JKn/SDR). - PSU-level Hall-Mammen wild bootstrap contract. - Replicate + `n_bootstrap > 0` rejection. 3. P2: `twowayfeweights()` docstring updated to document replicate acceptance. 4. P2: Added 3 regression tests in `tests/test_survey_dcdh_replicate_psu.py`: - `test_psu_map_size_mismatch_raises`: exercises the new strict equality check. - `test_generate_psu_or_group_weights_broadcast`: direct unit test of PSU-level broadcast — groups in the same PSU receive the same multiplier within a bootstrap replicate. - `test_generate_psu_or_group_weights_identity`: verifies the identity-map fast path is bit-identical to the non-PSU path. All 321 dCDH-related tests still pass. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 29 ++++++-- .../chaisemartin_dhaultfoeuille_bootstrap.py | 38 +++++++--- tests/test_survey_dcdh_replicate_psu.py | 74 +++++++++++++++++++ 3 files changed, 126 insertions(+), 15 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 9af4b6c0..d4905f30 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -604,11 +604,25 @@ def fit( ``drop_larger_lower=False`` to retain 2-switch groups. survey_design : SurveyDesign, optional Survey design specification for design-based inference. - Supports pweight with strata/PSU/FPC via Taylor Series - Linearization. Survey weights produce weighted cell means - for the point estimate; the IF-based variance accounts for - the survey design structure. Replicate weights are not yet - supported. + Supports ``weight_type='pweight'`` with two variance paths: + (1) Taylor Series Linearization using strata / PSU / FPC + (analytical) and (2) replicate-weight variance using + BRR / Fay / JK1 / JKn / SDR methods (analytical, + closed-form). Survey weights produce weighted cell means + for the point estimate. Under a survey design without an + explicit ``psu``, ``fit()`` auto-injects ``psu=group`` + so the group is the effective sampling unit (strata / PSU + must be within-group-constant; mixed labels within a group + raise ``ValueError``). When ``n_bootstrap > 0`` and a + survey design is supplied, the multiplier bootstrap + operates at the PSU level (Hall-Mammen wild PSU + bootstrap) — under the default auto-inject this collapses + to a group-level clustered bootstrap. **Replicate weights + combined with ``n_bootstrap > 0`` are rejected** with + ``NotImplementedError`` (replicate variance is + closed-form; bootstrap would double-count variance). See + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes for the + full contract.
Returns ------- @@ -5412,6 +5426,11 @@ def twowayfeweights( (matching ``fit(..., survey_design=sd).twfe_*``). Required to preserve fit-vs-helper parity under survey-backed inputs. Only ``weight_type='pweight'`` is supported; other types raise ValueError. + Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are accepted — + the TWFE diagnostic has no SE field on ``TWFEWeightsResult``, + so replicate weights only affect the cell aggregation path + (aggregated numbers are identical to + ``fit(..., survey_design=sd).twfe_*`` under the same input). Returns ------- diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 06eec38a..a3e7a918 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -357,20 +357,38 @@ def _slice_psu_map( group_to_psu_map: Optional[np.ndarray], target_size: int, ) -> Optional[np.ndarray]: - """Slice a full-set group-to-PSU map to a target's contributing group count. - - Multi-horizon / joiner / leaver targets share the overall variance- - eligible group ordering but may have fewer contributing groups - (``u_centered.shape[0] <= n_groups_for_overall``). Slice the first - ``target_size`` entries — this mirrors the existing shared-weights - truncation at the multi-horizon bootstrap site. Returns ``None`` - when no map is provided (plain multiplier-bootstrap path). + """Return a PSU map aligned to a subset bootstrap target. + + The dCDH bootstrap passes a single full-length ``group_to_psu_map`` + built in ``fit()`` from ``_eligible_group_ids`` (the variance- + eligible group ordering). By construction all current bootstrap + targets (overall, joiners, leavers, multi-horizon DID_l, placebo + DID^pl_l) use that same ordering — each target's IF vector has the + same length as the map and the entries correspond to the same + groups in the same order. 
This invariant is enforced by requiring + ``target_size == len(group_to_psu_map)``; if a future refactor + introduces a target whose group subset differs from the overall + variance-eligible ordering, this assertion fires loudly rather + than silently misclustering the bootstrap draws. + + Returns ``None`` when no map is provided (plain multiplier- + bootstrap path — identity across targets). Raises ``ValueError`` + when the target size does not match the map length: callers must + supply a target-specific map (not a truncation) if they introduce + non-aligned subsets. """ if group_to_psu_map is None: return None - if target_size <= 0 or target_size > len(group_to_psu_map): + if target_size == len(group_to_psu_map): return group_to_psu_map - return group_to_psu_map[:target_size] + raise ValueError( + f"PSU map length ({len(group_to_psu_map)}) does not match " + f"bootstrap target size ({target_size}). dCDH's bootstrap contract " + f"requires all targets to use the same variance-eligible group " + f"ordering as `_eligible_group_ids`. If this target has a " + f"different ordering, construct a target-specific map keyed by " + f"its actual group IDs rather than truncating the full map." + ) def _generate_psu_or_group_weights( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index aa1be454..1fa9409a 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -563,6 +563,80 @@ def test_non_survey_unchanged(self, base_panel): ) assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) + def test_psu_map_size_mismatch_raises(self): + """`_slice_psu_map` enforces strict length equality to prevent + silent miscluster if a future bootstrap target uses a different + group ordering than `_eligible_group_ids`. 
Today all targets + align so slicing is a no-op — this guards the invariant.""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _slice_psu_map, + ) + + full_map = np.array([0, 0, 1, 1, 2], dtype=np.int64) + # Exact length: no-op, returns the full map + assert np.array_equal(_slice_psu_map(full_map, 5), full_map) + # None passthrough + assert _slice_psu_map(None, 5) is None + # Mismatched length: loud failure + with pytest.raises(ValueError, match="PSU map length"): + _slice_psu_map(full_map, 3) + + def test_generate_psu_or_group_weights_broadcast(self): + """Direct unit test of the PSU-level weight generator: + groups mapped to the same PSU receive the same multiplier + within a single bootstrap replicate (Hall-Mammen wild PSU + contract).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + # 6 groups → 3 PSUs: groups (0,1), (2,3), (4,5) share multipliers. + group_to_psu = np.array([0, 0, 1, 1, 2, 2], dtype=np.int64) + rng = np.random.default_rng(0) + W = _generate_psu_or_group_weights( + n_bootstrap=50, + n_groups_target=6, + weight_type="rademacher", + rng=rng, + group_to_psu_map=group_to_psu, + ) + assert W.shape == (50, 6) + # Groups in the same PSU must receive the same multiplier + # within each bootstrap replicate. + assert np.array_equal(W[:, 0], W[:, 1]) + assert np.array_equal(W[:, 2], W[:, 3]) + assert np.array_equal(W[:, 4], W[:, 5]) + # Different PSUs should NOT all produce identical columns + # (given 50 replicates and Rademacher weights, collision + # probability is 2^-49 → effectively 0). 
+ assert not np.array_equal(W[:, 0], W[:, 2]) + + def test_generate_psu_or_group_weights_identity(self): + """Identity map (each group its own PSU) uses the fast path and + produces group-level weights bit-identical to the non-PSU path.""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + identity_map = np.arange(6, dtype=np.int64) + rng1 = np.random.default_rng(0) + W_identity = _generate_psu_or_group_weights( + n_bootstrap=20, + n_groups_target=6, + weight_type="mammen", + rng=rng1, + group_to_psu_map=identity_map, + ) + rng2 = np.random.default_rng(0) + W_plain = _generate_psu_or_group_weights( + n_bootstrap=20, + n_groups_target=6, + weight_type="mammen", + rng=rng2, + group_to_psu_map=None, + ) + assert np.array_equal(W_identity, W_plain) + @pytest.mark.parametrize("method", ["BRR", "JK1"]) def test_honest_did_under_replicate( self, base_panel, replicate_design, method From 2f8ce4c8630782c45dd2d2ef8d10dfb219bcee46 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:44:27 -0400 Subject: [PATCH 03/11] Build PSU map from group IDs, not shared positional array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the AI review R2 finding that the prior shared-map + length-assertion approach still relied on an unstated invariant (all bootstrap targets use the variance-eligible group ordering). Changes: - `_compute_dcdh_bootstrap` now accepts `group_id_to_psu_code: Dict[Any, int]` and `eligible_group_ids: np.ndarray` instead of a pre-built flat `group_to_psu_map` array. Every target call constructs its PSU map via `_map_for_target(target_size, group_id_to_psu_code, eligible_group_ids)` — a dict lookup per group ID, not a positional reuse. The length-assertion is retained for the size-mismatch case and now sits alongside a KeyError-based check for missing group IDs. - `_slice_psu_map` removed; `_map_for_target` replaces it. 
The explicit per-target construction removes the hidden semantic assumption that was previously enforced only by cardinality. - `fit()` builds the dict from `_eligible_group_ids` + the dense PSU codes. The per-target maps are all identical today (because every current target uses `_eligible_group_ids` ordering), but that is now a consequence of the caller's choice rather than a brittle assumption buried in the mixin. - Test `test_psu_map_size_mismatch_raises` replaced by `test_map_for_target_id_lookup`, which exercises the ID-lookup semantics including: - Correct map construction for a given eligible-group ordering. - Non-prefix reordering produces a different map (proving the ordering-from-IDs contract). - Length mismatch → ValueError. - Missing-group → ValueError. Full regression: 401 passing (previously 321). --- diff_diff/chaisemartin_dhaultfoeuille.py | 39 +++--- .../chaisemartin_dhaultfoeuille_bootstrap.py | 117 ++++++++++++------ tests/test_survey_dcdh_replicate_psu.py | 49 +++++--- 3 files changed, 134 insertions(+), 71 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d4905f30..54890d35 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2114,16 +2114,17 @@ def fit( h_data["did_l"], ) - # Under a survey design with PSU information, build a dense - # group_to_psu_map so the bootstrap randomizes at the PSU - # level rather than at the group level (Hall-Mammen wild PSU - # bootstrap). PSU/strata are within-group-constant by the - # _validate_group_constant_strata_psu precondition, so each - # variance-eligible group has exactly one PSU label. Under - # auto-inject psu=group the map is the identity and the - # bootstrap mixin's fast path reproduces the pre-PSU - # behavior bit-for-bit. 
- group_to_psu_map_bootstrap: Optional[np.ndarray] = None + # Under a survey design with PSU information, build a + # `group_id_to_psu_code` dict so the bootstrap mixin can + # derive each target's PSU map by ID lookup rather than by + # positional reuse. PSU/strata are within-group-constant by + # the _validate_group_constant_strata_psu precondition, so + # each variance-eligible group has exactly one PSU label. + # Under auto-inject psu=group each group has a unique PSU + # code and the bootstrap mixin's identity-map fast path + # reproduces the pre-PSU behavior bit-for-bit. + group_id_to_psu_code_bootstrap: Optional[Dict[Any, int]] = None + eligible_group_ids_bootstrap: Optional[np.ndarray] = None if ( resolved_survey is not None and getattr(resolved_survey, "psu", None) is not None @@ -2144,17 +2145,20 @@ def fit( break labels = obs_psu_codes[mask_g] # Within-group-constant PSU is validated upstream; - # .item() on the first is robust to integer vs string. + # the first label represents the whole group. group_psu_labels.append(labels[0]) if valid_map and len(group_psu_labels) == n_groups_for_overall_var: - # Dense integer indices for _generate_psu_or_group_weights. - _, group_to_psu_map_bootstrap = np.unique( + # Factor PSU labels to dense integer codes. 
+ _, dense_codes = np.unique( np.asarray(group_psu_labels), return_inverse=True, ) - group_to_psu_map_bootstrap = np.asarray( - group_to_psu_map_bootstrap, dtype=np.int64 - ) + dense_codes = np.asarray(dense_codes, dtype=np.int64) + group_id_to_psu_code_bootstrap = { + gid: int(code) + for gid, code in zip(_eligible_group_ids, dense_codes) + } + eligible_group_ids_bootstrap = np.asarray(_eligible_group_ids) br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, @@ -2166,7 +2170,8 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, - group_to_psu_map=group_to_psu_map_bootstrap, + group_id_to_psu_code=group_id_to_psu_code_bootstrap, + eligible_group_ids=eligible_group_ids_bootstrap, ) bootstrap_results = br diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a3e7a918..a85ae446 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -19,7 +19,7 @@ produce a bootstrap distribution per target. """ -from typing import TYPE_CHECKING, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np @@ -71,7 +71,8 @@ def _compute_dcdh_bootstrap( multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, # --- Survey: PSU-level bootstrap under survey designs --- - group_to_psu_map: Optional[np.ndarray] = None, + group_id_to_psu_code: Optional[Dict[Any, int]] = None, + eligible_group_ids: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. 
@@ -162,6 +163,14 @@ def _compute_dcdh_bootstrap( ) rng = np.random.default_rng(self.seed) + # PSU label for each bootstrap weight column is derived from + # the group's ID via `_map_for_target`, not from positional + # truncation. All current dCDH bootstrap targets use the + # variance-eligible group ordering (`eligible_group_ids`); if a + # future target uses a different ordering, add a dedicated + # group-IDs parameter for it rather than reusing the overall + # eligible list. + # --- Overall DID_M --- # Skip the scalar DID_M bootstrap when divisor_overall <= 0 # (e.g., pure non-binary panels where N_S=0), but continue @@ -177,7 +186,11 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, - group_to_psu_map=group_to_psu_map, + group_to_psu_map=_map_for_target( + u_centered_overall.shape[0], + group_id_to_psu_code, + eligible_group_ids, + ), ) else: overall_se = np.nan @@ -209,7 +222,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_j.size), + group_to_psu_map=_map_for_target( + u_j.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -229,7 +244,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_l.size), + group_to_psu_map=_map_for_target( + u_l.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -249,7 +266,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_pl.size), + group_to_psu_map=_map_for_target( + u_pl.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -276,7 +295,9 @@ def 
_compute_dcdh_bootstrap( n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, - group_to_psu_map=group_to_psu_map, + group_to_psu_map=_map_for_target( + n_groups_mh, group_id_to_psu_code, eligible_group_ids, + ), ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): @@ -335,7 +356,9 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, - group_to_psu_map=_slice_psu_map(group_to_psu_map, u_h.size), + group_to_psu_map=_map_for_target( + u_h.size, group_id_to_psu_code, eligible_group_ids, + ), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -353,42 +376,56 @@ def _compute_dcdh_bootstrap( # ============================================================================= -def _slice_psu_map( - group_to_psu_map: Optional[np.ndarray], +def _map_for_target( target_size: int, + group_id_to_psu_code: Optional[Dict[Any, int]], + eligible_group_ids: Optional[np.ndarray], ) -> Optional[np.ndarray]: - """Return a PSU map aligned to a subset bootstrap target. - - The dCDH bootstrap passes a single full-length ``group_to_psu_map`` - built in ``fit()`` from ``_eligible_group_ids`` (the variance- - eligible group ordering). By construction all current bootstrap - targets (overall, joiners, leavers, multi-horizon DID_l, placebo - DID^pl_l) use that same ordering — each target's IF vector has the - same length as the map and the entries correspond to the same - groups in the same order. This invariant is enforced by requiring - ``target_size == len(group_to_psu_map)``; if a future refactor - introduces a target whose group subset differs from the overall - variance-eligible ordering, this assertion fires loudly rather - than silently misclustering the bootstrap draws. - - Returns ``None`` when no map is provided (plain multiplier- - bootstrap path — identity across targets). 
Raises ``ValueError`` - when the target size does not match the map length: callers must - supply a target-specific map (not a truncation) if they introduce - non-aligned subsets. + """Build a PSU map for a bootstrap target from IDs (not positions). + + The caller passes: + - ``group_id_to_psu_code``: a dict mapping each variance-eligible + group ID to its dense PSU code (built once in ``fit()``). + - ``eligible_group_ids``: the ordered list of group IDs that + correspond to the current target's ``u_centered`` vector. + + Returns an integer array of length ``target_size`` where entry + ``i`` is the PSU code for the ``i``-th contributing group. + + Returns ``None`` when no PSU information is available (plain + multiplier-bootstrap path — identity across targets). + + Raises ``ValueError`` if ``target_size`` does not match + ``len(eligible_group_ids)``: every current dCDH bootstrap target + uses the variance-eligible group ordering, so any size mismatch + signals that a caller introduced a target whose group subset + diverges and should pass its own ``target_group_ids`` rather than + reusing the overall eligible list. Also raises ``ValueError`` if + any group ID is missing from the dict (signaling misalignment + between the target's IF vector and the map's keys). """ - if group_to_psu_map is None: + if group_id_to_psu_code is None or eligible_group_ids is None: return None - if target_size == len(group_to_psu_map): - return group_to_psu_map - raise ValueError( - f"PSU map length ({len(group_to_psu_map)}) does not match " - f"bootstrap target size ({target_size}). dCDH's bootstrap contract " - f"requires all targets to use the same variance-eligible group " - f"ordering as `_eligible_group_ids`. If this target has a " - f"different ordering, construct a target-specific map keyed by " - f"its actual group IDs rather than truncating the full map." 
- ) + if target_size != len(eligible_group_ids): + raise ValueError( + f"Bootstrap target size ({target_size}) does not match " + f"eligible_group_ids length ({len(eligible_group_ids)}). " + "dCDH's bootstrap contract requires all current targets to " + "use the variance-eligible group ordering; if a new target " + "has a different ordering, pass target-specific group IDs " + "to _map_for_target rather than reusing eligible_group_ids." + ) + try: + return np.array( + [group_id_to_psu_code[gid] for gid in eligible_group_ids], + dtype=np.int64, + ) + except KeyError as e: + raise ValueError( + f"Group ID {e.args[0]!r} in eligible_group_ids has no entry " + f"in group_id_to_psu_code — PSU map is misaligned with the " + f"bootstrap target's group set." + ) from e def _generate_psu_or_group_weights( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 1fa9409a..836583d2 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -563,23 +563,44 @@ def test_non_survey_unchanged(self, base_panel): ) assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10) - def test_psu_map_size_mismatch_raises(self): - """`_slice_psu_map` enforces strict length equality to prevent - silent miscluster if a future bootstrap target uses a different - group ordering than `_eligible_group_ids`. Today all targets - align so slicing is a no-op — this guards the invariant.""" + def test_map_for_target_id_lookup(self): + """`_map_for_target` builds the PSU map from group IDs via dict + lookup, not positional reuse. All current dCDH bootstrap + targets use the variance-eligible group ordering, so the + helper looks up each target group's PSU via + `group_id_to_psu_code`. 
Length mismatch → ValueError (loud + failure rather than silent miscluster).""" from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( - _slice_psu_map, + _map_for_target, ) - full_map = np.array([0, 0, 1, 1, 2], dtype=np.int64) - # Exact length: no-op, returns the full map - assert np.array_equal(_slice_psu_map(full_map, 5), full_map) - # None passthrough - assert _slice_psu_map(None, 5) is None - # Mismatched length: loud failure - with pytest.raises(ValueError, match="PSU map length"): - _slice_psu_map(full_map, 3) + gid_to_psu = {"a": 0, "b": 0, "c": 1, "d": 1, "e": 2} + eligible = np.asarray(["a", "b", "c", "d", "e"]) + expected = np.array([0, 0, 1, 1, 2], dtype=np.int64) + built = _map_for_target(5, gid_to_psu, eligible) + assert built is not None + assert np.array_equal(built, expected) + + # None passthrough when no PSU info + assert _map_for_target(5, None, None) is None + + # Length mismatch → loud failure + with pytest.raises(ValueError, match="target size"): + _map_for_target(3, gid_to_psu, eligible) + + # Missing group ID → loud failure + gid_to_psu_incomplete = {"a": 0, "b": 0, "c": 1, "d": 1} + with pytest.raises(ValueError, match="has no entry"): + _map_for_target(5, gid_to_psu_incomplete, eligible) + + # Non-prefix reordering: different ordered group IDs produce a + # different PSU map even if the dict is the same. This is the + # key invariant the previous prefix-slicing lacked. 
+ eligible_reordered = np.asarray(["c", "a", "d", "e", "b"]) + built_reordered = _map_for_target(5, gid_to_psu, eligible_reordered) + assert built_reordered is not None + expected_reordered = np.array([1, 0, 1, 2, 0], dtype=np.int64) + assert np.array_equal(built_reordered, expected_reordered) def test_generate_psu_or_group_weights_broadcast(self): """Direct unit test of the PSU-level weight generator: From 84c9895b58a367b06187ddfb0ffa5a09ec514294 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 08:46:57 -0400 Subject: [PATCH 04/11] Enforce explicit multi-horizon alignment in shared-draw bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AI review R3 flagged the remaining positional truncation in the multi-horizon shared-draw path: w_h = shared_weights[:, : u_h.size] Under the current contract every horizon's IF vector uses the full variance-eligible group ordering, so `u_h.size == n_groups_mh` always holds and the slice is a no-op. But the truncation itself is a hidden positional assumption — if a future refactor introduced horizon- specific masking, the shared-weight alignment would break silently. Replace the truncation with: 1. An explicit assertion `u_h.size == n_groups_mh` that raises with a message pointing future refactors at the right fix (threading target-specific group IDs through `multi_horizon_inputs`). 2. Direct reuse of the full `shared_weights` matrix — removes the slice semantic entirely. Add `TestPSUBootstrap.test_multi_horizon_shared_draw_under_coarser_psu` covering an end-to-end L_max=2 + coarser PSU case that exercises the shared-draw path + assertion. Full regression: 312 passing. 
--- .../chaisemartin_dhaultfoeuille_bootstrap.py | 25 ++++++++++++++++-- tests/test_survey_dcdh_replicate_psu.py | 26 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a85ae446..a456890e 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -302,8 +302,29 @@ def _compute_dcdh_bootstrap( for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): if u_h.size > 0 and n_h > 0: - # Use the shared weight matrix truncated to u_h length - w_h = shared_weights[:, : u_h.size] + # Under the current contract every horizon's IF + # vector uses the variance-eligible group ordering + # from `eligible_group_ids`, so the shared weight + # matrix is already at the right shape. Assert + # this invariant so any future refactor that + # introduces horizon-specific masking fails loudly + # rather than silently misaligning PSU clusters via + # positional truncation. + if u_h.size != n_groups_mh: + raise ValueError( + f"Multi-horizon bootstrap: horizon {l_h} " + f"IF vector has {u_h.size} entries but " + f"shared weight matrix has {n_groups_mh} " + f"columns. dCDH's contract requires every " + f"horizon to use the variance-eligible " + f"group ordering; to support a horizon " + f"with a different ordering, thread " + f"target-specific group IDs through " + f"`multi_horizon_inputs` and project the " + f"shared PSU draws onto the horizon's own " + f"ordering via `_map_for_target`." 
+ ) + w_h = shared_weights deviations = (w_h @ u_h) / n_h dist_h = deviations + eff_h diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 836583d2..3838dd16 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -501,6 +501,32 @@ def test_multi_horizon_psu_bootstrap(self): assert np.isfinite(info["effect"]) assert np.isfinite(info["se"]) and info["se"] > 0 + def test_multi_horizon_shared_draw_under_coarser_psu(self): + """End-to-end test of the shared-draw multi-horizon bootstrap + under a strictly coarser PSU with L_max >= 2. Exercises the + per-horizon invariant assertion + sup-t band computation + through the Hall-Mammen wild PSU path.""" + df = _make_strictly_coarser_psu_panel(n_periods=7) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # Both horizons should produce finite bootstrap SEs via the + # shared-draw path projected through the PSU map. + for lag in (1, 2): + info = res.event_study_effects[lag] + if info["n_obs"] > 0: + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + def test_sup_t_under_coarser_psu(self): """Sup-t critical value is computable under coarser PSU.""" df = _make_strictly_coarser_psu_panel(n_periods=6) From 13d5e41f646d39ef2ad9b6c58c3a9542977f8ef9 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 09:24:30 -0400 Subject: [PATCH 05/11] Fix CI review R1: reduce (not replace) design df under replicate variance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses R1 P1 from PR #311 AI review. 
The R1 reviewer flagged that `_effective_df_survey()` was overwriting the design-level `resolved_survey.df_survey` with `min(n_valid) - 1` whenever any replicate-aware IF site ran. For replicate designs with rank deficiency, `ResolvedSurveyDesign.df_survey` returns `QR-rank - 1` (per R's `survey::degf()` convention at `diff_diff/survey.py:590`), which can be strictly smaller than `R - 1`. The old behavior produced anti-conservative t-critical values on the top-level dCDH surface. Additionally, the final effective df was never persisted into `survey_metadata.df_survey`, so HonestDiD (which reads from `results.survey_metadata.df_survey` at `honest_did.py:973`) could use a different df than the main dCDH surface. Changes: - `_effective_df_survey()` now returns `min(resolved_survey.df_survey, min(n_valid) - 1)` when both are defined; returns None if base df is undefined (rank ≤ 1) or reduced df is degenerate (< 1). - In `fit()`, immediately before constructing `ChaisemartinDHaultfoeuilleResults`, mutate `survey_metadata.df_survey` to the final effective df (post-all- IF-sites, including heterogeneity). `SurveyMetadata` is a mutable `@dataclass` so direct assignment is safe. - `_compute_heterogeneity_test()` docstring updated to describe the replicate-weight dispatch and effective-df rule (R1 P3). Regression tests in `TestInvariants`: - `test_rank_deficient_replicate_uses_design_df`: duplicated replicate columns → `design_df < R - 1` → persisted `survey_metadata.df_survey` must equal `design_df` (not `R - 1`). - `test_dropped_replicate_reduces_df`: zeroed replicate columns produce rank deficiency in the design → persisted df must be bounded by `design_df`. Full regression: 324 passing. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 63 ++++++++++++++--- tests/test_survey_dcdh_replicate_psu.py | 87 ++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 11 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 54890d35..d72f543a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2683,6 +2683,22 @@ def fit( effective_joiners_available = False effective_leavers_available = False + # Persist the final effective df_survey (post-heterogeneity, + # post-all-IF-sites) into survey_metadata so downstream + # consumers — HonestDiD bounds (honest_did.py:973 reads + # results.survey_metadata.df_survey), exported metadata, and + # users — all see the same df that the top-level dCDH inference + # actually used. Before this assignment, survey_metadata carries + # the design-level `resolved_survey.df_survey` from survey + # resolution; under replicate designs the effective df may be + # smaller (reduced via _replicate_n_valid_list). SurveyMetadata + # is a mutable @dataclass (diff_diff/survey.py:681), so direct + # attribute assignment is safe. + if survey_metadata is not None: + survey_metadata.df_survey = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, overall_se=effective_overall_se, @@ -3439,8 +3455,21 @@ def _compute_heterogeneity_test( Observation-level survey info with keys ``group_ids`` (raw per-row group labels), ``weights`` (per-row survey weights), and ``resolved`` (ResolvedSurveyDesign). When provided, the regression uses WLS with - per-group weights W_g = sum of obs survey weights, and SE is computed - via Binder TSL IF expansion through ``compute_survey_if_variance``. + per-group weights W_g = sum of obs survey weights. 
SE is computed + via Binder TSL IF expansion through ``compute_survey_if_variance`` + by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), + dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style + variance. The effective df for t-critical values follows the + site-level ``min(df_s, n_valid_het - 1)`` rule and the helper + mutates ``replicate_n_valid_list`` so the final + ``_effective_df_survey(...)`` sees this site's n_valid. + replicate_n_valid_list : list[int], optional + Shared accumulator for replicate-weight ``n_valid`` counts across + IF sites. When provided and a replicate design is in use, this + function appends its own ``n_valid_het`` before computing the + local effective df — so both the local inference fields and the + final ``survey_metadata.df_survey`` (set by ``fit()``) reflect + this site's contribution. Returns ------- @@ -4985,18 +5014,30 @@ def _effective_df_survey( Under replicate-weight designs, each IF site returns an ``n_valid`` count (number of replicate columns that produced a finite estimate). - This helper returns ``min(n_valid) - 1`` across sites, matching the - precedent in ``diff_diff/efficient_did.py:1133-1135`` and - ``diff_diff/triple_diff.py:676-686``. Under TSL (analytical) or - non-survey fits, ``replicate_n_valid_list`` is empty and this falls - through to ``resolved_survey.df_survey`` (or ``None`` when there's - no survey design at all). + This helper **reduces** — never overwrites — the design-level + ``resolved_survey.df_survey`` (which for replicate designs is + ``QR-rank - 1`` per R's ``survey::degf()`` convention; see + ``diff_diff/survey.py:590``). Returns + ``min(resolved_survey.df_survey, min(n_valid) - 1)`` when both are + defined. + + Matches the precedent in ``diff_diff/efficient_did.py:1133-1135`` + and ``diff_diff/triple_diff.py:676-686`` (both reduce, not + replace). 
Under TSL (analytical) or non-survey fits, + ``replicate_n_valid_list`` is empty and this falls through to the + design-level df. Returns ``None`` when either the base df is + undefined (rank ≤ 1) or the reduced df would be < 1 — preserving + the NaN-inference contract from ``safe_inference``. """ if resolved_survey is None: return None - if replicate_n_valid_list: - return int(min(replicate_n_valid_list)) - 1 - return resolved_survey.df_survey + base_df = resolved_survey.df_survey + if not replicate_n_valid_list: + return base_df + reduced_df = int(min(replicate_n_valid_list)) - 1 + if base_df is None or reduced_df < 1: + return None + return min(int(base_df), reduced_df) def _compute_se( diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 3838dd16..c07d2cd9 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -711,3 +711,90 @@ def test_honest_did_under_replicate( # honest_did.py internals. Just verify the attribute is # populated so the SE flow-through is exercised. assert res.honest_did_results is not None + + def test_rank_deficient_replicate_uses_design_df(self, base_panel): + """Duplicated replicate columns → QR-rank < R → design df < R-1. + + Regression for PR #311 CI review R1 P1. The main dCDH surface, + ``survey_metadata.df_survey``, and HonestDiD must all use the + reduced effective df (``min(design_df, n_valid - 1)``) rather + than the naive ``n_valid - 1`` that ignored the design's QR + rank. Before the fix, ``survey_metadata.df_survey`` stayed at + ``R - 1`` for rank-deficient replicate designs — which is + anti-conservative for t-inference. + """ + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Force duplicate columns to induce rank deficiency: + # rep1 = rep0, rep3 = rep2 → true rank = R - 2. 
+ df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + # Read off the design-level df via the resolved design so the + # test is robust to internal QR tolerance changes. + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1, ( + f"Expected rank deficiency: design df={design_df} " + f"should be < {R - 1}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, honest_did=True, + ) + # survey_metadata carries the final (reduced) df — not R - 1. + assert res.survey_metadata.df_survey == design_df, ( + f"Expected survey_metadata.df_survey == design_df " + f"({design_df}), got {res.survey_metadata.df_survey}." + ) + # HonestDiD reads results.survey_metadata.df_survey directly + # (honest_did.py:973), so the same reduced df flows through. + # We assert the HonestDiD result is populated (df flow-through + # is exercised by the honest_did=True call above). + assert res.honest_did_results is not None + + def test_dropped_replicate_reduces_df(self, base_panel): + """When some replicate columns produce degenerate per-replicate + estimates (e.g., all-zero weight vectors), the design's QR rank + drops below R — which flows through to the effective df. + + Regression for PR #311 CI review R1 P2. Even when every IF site + reports the full ``n_valid = R``, the persisted df must never + exceed the design-level df. Pre-fix, ``_effective_df_survey`` + returned ``n_valid - 1 = R - 1`` which over-counted degrees of + freedom on a reduced-rank design. + """ + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=4) + # Zero out two replicate columns → drops rank by 2. 
+ df["rep0"] = 0.0 + df["rep1"] = 0.0 + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df <= R - 3, ( + f"Expected zero-column rank deficiency: design df={design_df} " + f"should be <= {R - 3}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # Persisted df must be capped by the design df, not R - 1. + assert res.survey_metadata.df_survey is not None + assert res.survey_metadata.df_survey <= design_df, ( + f"Expected survey_metadata.df_survey <= design_df " + f"({design_df}), got {res.survey_metadata.df_survey}. " + f"Effective df must NEVER exceed the design-level df under " + f"replicate variance." + ) + assert res.survey_metadata.df_survey < R - 1, ( + f"Expected persisted df to reflect reduced rank (< R - 1 = " + f"{R - 1}), got {res.survey_metadata.df_survey}." + ) From 7e62d4cf479d3c4aede4dc595791d2b29fa7b8b1 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:09:49 -0400 Subject: [PATCH 06/11] Fix CI review R2: undefined-df NaN inference + cross-surface df consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses R2 P0 + P1 from PR #311 AI review. R2 P0: `_effective_df_survey` returns None when replicate design has undefined base df (QR-rank ≤ 1), but `safe_inference(df=None)` uses z-inference — NOT NaN. So a rank-1 replicate design with finite SE silently produced valid-looking p-values/CIs where REGISTRY.md mandates NaN. R2 P1: `_compute_heterogeneity_test` re-derived its local df_s from `min(replicate_n_valid_list) - 1` directly — bypassing the base-df cap the rest of dCDH uses. 
When df_s was None, the in-loop fallback promoted it to `n_valid_het - 1`, which rescued a rank-deficient design in heterogeneity inference that the main surface correctly treated as NaN. Separately, heterogeneity appended its own n_valid AFTER main inference had frozen, leaving `overall_t`, `event_study_effects`, etc. computed with a larger-than-final df while HonestDiD (reading the post-heterogeneity `survey_metadata.df_survey`) used the smaller final df. Changes: - Add `_inference_df(effective_df, resolved_survey)` helper next to `_effective_df_survey`: coerces None to 0 when replicate (triggers safe_inference's NaN branch), else returns effective_df as-is. - Wrap every dCDH `safe_inference(df=...)` call site (~14 sites) with `_inference_df(df, resolved_survey)`. - In `_compute_heterogeneity_test`, derive `df_s` via `_effective_df_survey(resolved, list(replicate_n_valid_list or []))` instead of the ad-hoc `min - 1`. When `df_s is None` (undefined base df), keep `df_s_local = None` — never promote. - Add a post-heterogeneity finalization block in `fit()` that re-runs `safe_inference` for every public surface (overall, joiners, leavers, multi-horizon, placebo-horizon, heterogeneity) with the FINAL effective df. This guarantees every surface and `survey_metadata.df_survey` use the same df. - Update REGISTRY.md heterogeneity Note to document replicate-weight dispatch and the effective-df rule. Regression tests (TestInvariants): - `test_rank_1_replicate_forces_nan_inference`: identical replicate columns → rank-1 design → design df None → all inference fields NaN. - `test_heterogeneity_replicate_cross_surface_df_consistency`: rank-deficient replicate design with `heterogeneity=` active — asserts `overall_p_value` matches `t(final_df)` distribution and heterogeneity fields agree. Full regression: 326 passing. 
--- diff_diff/chaisemartin_dhaultfoeuille.py | 184 +++++++++++++++++------ docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 105 +++++++++++++ 3 files changed, 248 insertions(+), 43 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d72f543a..d4a6fb15 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -1627,7 +1627,7 @@ def fit( did_l_val = multi_horizon_dids[l_h]["did_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) + t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey)) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1756,7 +1756,8 @@ def fit( pl_val = pl_data["placebo_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, df=_df_s + pl_val, se_pl_l, alpha=self.alpha, + df=_inference_df(_df_s, resolved_survey), ) placebo_horizon_inference[lag_l] = { "effect": pl_val, @@ -1863,7 +1864,8 @@ def fit( ) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, df=_df_survey + overall_att, overall_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) # Joiners SE (uses joiner-only centered IF; conservative bound) @@ -1878,7 +1880,8 @@ def fit( _replicate_n_valid_list.append(n_valid_joiners) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, df=_df_survey + joiners_att, joiners_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: joiners_se, joiners_t, joiners_p, joiners_ci = 
( @@ -1900,7 +1903,8 @@ def fit( _replicate_n_valid_list.append(n_valid_leavers) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, df=_df_survey + leavers_att, leavers_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: leavers_se, leavers_t, leavers_p, leavers_ci = ( @@ -2200,17 +2204,17 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_df_survey)[0] + overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_df_survey)[0] + joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_df_survey)[0] + leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2372,7 +2376,8 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se 
effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, df=_df_survey + delta_val, delta_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: effective_overall_se = float("nan") @@ -2406,7 +2411,8 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, df=_df_survey + pl_data["placebo_l"], pl_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) placebo_event_study_dict[-lag_l] = { "effect": pl_data["placebo_l"], @@ -2457,7 +2463,8 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, df=_df_survey + eff, bs_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), )[0] # Phase 2: build normalized_effects with SE @@ -2470,7 +2477,7 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_df_survey) + t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey)) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2529,7 +2536,8 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, df=_df_survey + cum_effect, running_se_ub, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) cumulated[l_h] = { "effect": cum_effect, @@ -2683,21 +2691,73 @@ def fit( effective_joiners_available = False effective_leavers_available = False - # Persist the final effective df_survey (post-heterogeneity, - # post-all-IF-sites) into survey_metadata so downstream - # consumers — HonestDiD bounds 
(honest_did.py:973 reads - # results.survey_metadata.df_survey), exported metadata, and - # users — all see the same df that the top-level dCDH inference - # actually used. Before this assignment, survey_metadata carries - # the design-level `resolved_survey.df_survey` from survey - # resolution; under replicate designs the effective df may be - # smaller (reduced via _replicate_n_valid_list). SurveyMetadata - # is a mutable @dataclass (diff_diff/survey.py:681), so direct - # attribute assignment is safe. - if survey_metadata is not None: - survey_metadata.df_survey = _effective_df_survey( - resolved_survey, _replicate_n_valid_list + # R2 P1b: Finalize replicate-df propagation across public + # surfaces. When heterogeneity is active, its n_valid is + # appended to `_replicate_n_valid_list` AFTER the main + # surfaces (overall / joiners / leavers / event study / + # placebo horizon) have already computed their t/p/CI fields + # with an intermediate `_df_survey`. If heterogeneity reports + # the smallest n_valid, the main-surface inference would be + # anti-conservative relative to `survey_metadata.df_survey` + # and HonestDiD. Re-run safe_inference with the FINAL + # effective df so every surface agrees. 
+ _final_eff_df = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + if _replicate_n_valid_list: + _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, + alpha=self.alpha, df=_final_inf_df, ) + if joiners_available: + joiners_t, joiners_p, joiners_ci = safe_inference( + joiners_att, joiners_se, + alpha=self.alpha, df=_final_inf_df, + ) + if leavers_available: + leavers_t, leavers_p, leavers_ci = safe_inference( + leavers_att, leavers_se, + alpha=self.alpha, df=_final_inf_df, + ) + if multi_horizon_inference is not None: + for _lag_r2, _info_r2 in list(multi_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if placebo_horizon_inference is not None: + for _lag_r2, _info_r2 in list(placebo_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if heterogeneity_effects: + for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): + if np.isfinite(_info_r2["se"]): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["beta"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + + # Persist the final effective df_survey into survey_metadata so + # downstream consumers — HonestDiD bounds (honest_did.py:973 + # reads results.survey_metadata.df_survey), exported metadata, + # and users — all see the same df that the recomputed + # inference above used. SurveyMetadata is a mutable @dataclass + # (diff_diff/survey.py:681), so direct attribute assignment is + # safe. 
+ if survey_metadata is not None: + survey_metadata.df_survey = _final_eff_df results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, @@ -3496,17 +3556,18 @@ def _compute_heterogeneity_test( obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - # df_s starts from the design-level df (n_psu - n_strata under TSL, - # R - 1 under replicate). It may be reduced further by replicate - # failures reported via replicate_n_valid_list (this function - # appends its own n_valid observations). - if ( - replicate_n_valid_list is not None - and len(replicate_n_valid_list) > 0 - ): - df_s = int(min(replicate_n_valid_list)) - 1 - else: - df_s = resolved.df_survey if resolved is not None else None + # df_s starts from the shared effective df (base-capped by + # design df). Under replicate designs the helper handles the + # min(resolved.df_survey, min(n_valid) - 1) reduction and + # preserves None when the base df is undefined (QR-rank ≤ 1). + # Using the helper here — rather than re-deriving locally — + # keeps heterogeneity's df consistent with the main dCDH + # surfaces (R2 P1a). `list(... or [])` avoids accidental + # mutation of the caller's shared tracker at this site; the + # explicit append happens inside the horizon loop below. + df_s = _effective_df_survey( + resolved, list(replicate_n_valid_list or []) + ) # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, # Step 6 multi-switch) appear in obs_gids_raw but must be @@ -3674,9 +3735,17 @@ def _compute_heterogeneity_test( if replicate_n_valid_list is not None: replicate_n_valid_list.append(n_valid_het) # Reduce df_s to reflect this horizon's n_valid. 
- df_s_local = int(n_valid_het) - 1 if df_s is None else min( - df_s, int(n_valid_het) - 1 - ) + # R2 P1a: when the shared base-capped df_s is None + # (undefined base df, e.g., QR-rank ≤ 1), the + # heterogeneity df MUST stay None — per-site n_valid + # cannot rescue a rank-deficient design. The + # _inference_df wrapper at the safe_inference call + # below coerces None to 0 under replicate, forcing + # NaN inference. + if df_s is None: + df_s_local = None + else: + df_s_local = min(int(df_s), int(n_valid_het) - 1) else: var_s = compute_survey_if_variance(psi_obs, resolved) df_s_local = df_s @@ -3686,7 +3755,8 @@ def _compute_heterogeneity_test( else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s_local + beta_het, se_het, alpha=alpha, + df=_inference_df(df_s_local, resolved), ) results[l_h] = { @@ -5006,6 +5076,36 @@ def _validate_group_constant_strata_psu( ) +def _inference_df( + effective_df: Optional[int], + resolved_survey: Any, +) -> Optional[int]: + """Coerce an effective-df value for use in ``safe_inference(df=...)``. + + ``_effective_df_survey()`` returns ``None`` when the replicate + design's base df is undefined (QR-rank ≤ 1) or when the reduced + df would be < 1. ``safe_inference`` treats ``df=None`` as the + standard normal (z-inference) and only returns NaN for + ``df <= 0``. Under a replicate design, "undefined effective df" + MUST map to NaN inference per REGISTRY.md — NOT to z-inference. + + Returns: + - ``effective_df`` when defined (t-inference with that df). + - ``0`` when ``effective_df is None`` AND ``resolved_survey`` uses + replicate variance — forces ``safe_inference`` to return NaN + t/p/CI via its ``df <= 0`` branch. + - ``None`` otherwise (no survey, or TSL design where the resolver + always returns an int df — z-inference is never reachable here). 
+ """ + if effective_df is not None: + return effective_df + if resolved_survey is None: + return None + if getattr(resolved_survey, "uses_replicate_variance", False): + return 0 + return None + + def _effective_df_survey( resolved_survey: Any, replicate_n_valid_list: List[int], diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 60444f9e..8a95f40b 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. 
-- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. +- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). 
Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. 
The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. 
diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index c07d2cd9..5c873611 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -798,3 +798,108 @@ def test_dropped_replicate_reduces_df(self, base_panel): f"Expected persisted df to reflect reduced rank (< R - 1 = " f"{R - 1}), got {res.survey_metadata.df_survey}." ) + + def test_rank_1_replicate_forces_nan_inference(self, base_panel): + """All replicate columns identical → QR-rank = 1 → design df + undefined (None). Even if per-IF replicate SEs come back + finite, every public inference field (t_stat / p_value / + conf_int) MUST be NaN — otherwise users get z-based inference + silently instead of the NaN contract documented in REGISTRY.md. + + R2 P0 regression: before the fix, ``_effective_df_survey`` + returned None, ``safe_inference(df=None)`` computed z-based + statistics, and a rank-1 replicate design with finite SEs + would produce valid-looking (but wrong) p-values/CIs. + """ + R = 8 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Make every replicate column identical → QR-rank = 1. + rep_col_shared = df["rep0"].copy() + for r in range(R): + df[f"rep{r}"] = rep_col_shared + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + assert resolved.df_survey is None, ( + "Rank-1 replicate matrix must have undefined design df" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # survey_metadata.df_survey stays None — the honest undefined + # signal for downstream consumers. + assert res.survey_metadata.df_survey is None + # Main dCDH inference fields are NaN via the _inference_df + # coercion (None → 0 under replicate → safe_inference NaN + # branch). 
+ assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + # Event study horizon 1 is also NaN. + if 1 in res.event_study_effects: + info = res.event_study_effects[1] + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + + def test_heterogeneity_replicate_cross_surface_df_consistency( + self, base_panel, + ): + """With ``heterogeneity=`` active under a rank-deficient + replicate design, every public surface — top-level dCDH, + event study, heterogeneity, ``survey_metadata.df_survey``, + and HonestDiD — MUST use the SAME final effective df. + + R2 P1 regression: before the fix, the main surfaces + (``overall_*``, ``event_study_effects``) computed t/p/CI + with an intermediate ``_df_survey`` that did not include + heterogeneity's ``n_valid_het``. Then + ``survey_metadata.df_survey`` was overwritten with the + post-heterogeneity min — so HonestDiD used a different df + than the main dCDH surface. + """ + from scipy import stats as _stats + + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=7) + # Rank deficiency: 2 duplicated pairs → rank = R - 2 = 10. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1 + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + honest_did=True, + ) + final_df = res.survey_metadata.df_survey + # All sites report n_valid == R → effective df = min(design_df, R-1) + # = design_df. + assert final_df == design_df + # Verify overall inference was computed with df=final_df by + # checking p-value against t(final_df) — would disagree if + # df=R-1 or df=None was used. 
+ t_stat = res.overall_att / res.overall_se + expected_p = 2 * _stats.t.sf(abs(t_stat), df=final_df) + assert res.overall_p_value == pytest.approx(expected_p, rel=1e-6) + # Same check for heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]): + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf(abs(het_t), df=final_df) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6 + ) + # HonestDiD bounds were computed from the same survey_metadata; + # asserting the result attribute is populated confirms the df + # flow-through was exercised. + assert res.honest_did_results is not None From b48ee534e3a0934d8767fa9f8c22eb24788ccbd6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:25:29 -0400 Subject: [PATCH 07/11] Fix CI review R3: placebo-horizon key + registry df formula MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two non-blocker items from PR #311 AI review R3: - P2: `test_placebo_under_replicate` queried `placebo_event_study[1]` but placebo horizons are stored under NEGATIVE keys (REGISTRY.md line 546 — `-l` corresponds to backward outcome difference `Y_{g, F_g-1-l} - Y_{g, F_g-1}`; storage at `chaisemartin_dhaultfoeuille.py:2402` uses `-lag_l` as the key). The test's assertion silently skipped (`.get(1)` returned None) so the replicate placebo SE path was effectively unverified. Fixed to use key `-1` and added explicit assertions on `se`, `p_value`, and `conf_int`. - P3: REGISTRY line 653 described the replicate-effective df as `min(n_valid_across_sites) - 1`, but the actual rule (and the detailed heterogeneity note at line 618) is the capped form `min(resolved_survey.df_survey, min(n_valid) - 1)`. Updated the wording to include the design-df cap and the undefined-base-df NaN case — matching the code and line 618. Full regression: 316 passing. 
--- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 8a95f40b..48c734f3 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -650,7 +650,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. - **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). 
Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. -- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. 
For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(n_valid_across_sites) - 1` propagates to t-critical values. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. 
--- diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index 5c873611..baa36120 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -266,7 +266,12 @@ def test_multi_horizon_under_replicate( assert np.isfinite(info["se"]) and info["se"] > 0 def test_placebo_under_replicate(self, replicate_design): - """Placebo DID^{pl}_l under replicate weights also carries finite SE.""" + """Placebo DID^{pl}_l under replicate weights carries finite SE, + p-value, and CI. Placebo horizons are stored under NEGATIVE + keys in ``placebo_event_study`` (see REGISTRY.md line 546 — + ``-l`` corresponds to backward outcome difference + ``Y_{g, F_g-1-l} - Y_{g, F_g-1}``). Pre-R3-fix this test used + key ``1`` and silently skipped the assertion.""" df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) R = 20 df = replicate_design(df, R=R, method="BRR") @@ -280,10 +285,20 @@ def test_placebo_under_replicate(self, replicate_design): survey_design=sd, L_max=1, ) - if res.placebo_event_study: - info = res.placebo_event_study.get(1) - if info is not None and info.get("n_obs", 0) > 0: - assert np.isfinite(info["se"]) + assert res.placebo_event_study, ( + "Expected placebo_event_study to be populated for L_max=1 " + "reversible panel" + ) + # Negative key convention: placebo horizon l → key -l. 
+ assert -1 in res.placebo_event_study, ( + f"Expected key -1 in placebo_event_study (got keys " + f"{sorted(res.placebo_event_study)})" + ) + info = res.placebo_event_study[-1] + assert info["n_obs"] > 0, "Placebo horizon should have observations" + assert np.isfinite(info["se"]) and info["se"] > 0 + assert np.isfinite(info["p_value"]) + assert np.all(np.isfinite(info["conf_int"])) def test_did_x_replicate(self, base_panel, replicate_design): """DID^X covariate adjustment flows through the Class A dispatch From bcc14c41e18ac4a09ce36f87e044857c7e829250 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 10:46:21 -0400 Subject: [PATCH 08/11] Fix CI review R4: late-arriving n_valid propagation + doc drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R4 (1 × P2 + 2 × P3). P2: The cross-surface df consistency test used a duplicated-replicate design where `resolved.df_survey` binds as the cap BEFORE heterogeneity contributes. That scenario can't detect a broken late-propagation path. Added a monkeypatch-based regression (`test_heterogeneity_late_nvalid_propagates_to_all_surfaces`) that: 1. First counts the main-surface calls to `compute_replicate_if_variance` (no heterogeneity). 2. Then runs fit with `heterogeneity=` active and a patched helper that returns a SMALLER n_valid ONLY on the heterogeneity call (call index > main_count), leaving main surfaces at full n_valid. 3. Asserts every public surface (`overall_p_value`, `event_study_effects`, `placebo_event_study`, `heterogeneity_effects`, `survey_metadata.df_survey`) uses the reduced df via t(reduced_n_valid - 1) distribution. The test caught two real bugs that the duplicated-column test missed: - `effective_overall_t/p/ci` are set by copying from the raw `overall_t/p/ci` at line ~2331 (before heterogeneity runs) and are used as the results-constructor inputs at line ~2776. 
The R2 recompute block only reassigned `overall_*` — so `effective_overall_*` (what actually shipped) stayed on the pre-heterogeneity df. Fix: recompute `effective_overall_*` from `effective_overall_att/se` (covering both the normal and cost-benefit delta paths), and keep `overall_*` in sync for downstream consumers. - `placebo_event_study_dict[-lag]` holds VALUE copies of `placebo_horizon_inference[lag]`, so mutating the inner dict in the recompute block does NOT propagate to the public surface. Fix: explicitly update `placebo_event_study_dict[-lag]` alongside the mutation in the recompute loop. P3a: REGISTRY overview line 466 still said "Replicate weights and PSU-level bootstrap are deferred" — contradicting the updated checklist at lines 651-653. Rewrote to match the new contract. P3b: `diff_diff/guides/llms-full.txt:325` still said "Replicate weights and PSU-level bootstrap deferred". Updated to reflect replicate support + opt-in PSU-level Hall-Mammen bootstrap + the mutual-exclusion contract with `n_bootstrap > 0`. Full regression: 327 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 28 +++++ diff_diff/guides/llms-full.txt | 2 +- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh_replicate_psu.py | 137 +++++++++++++++++++++++ 4 files changed, 167 insertions(+), 2 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d4a6fb15..ed6c459f 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2706,6 +2706,21 @@ def fit( ) if _replicate_n_valid_list: _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + # Recompute `effective_overall_*` directly — that's what + # ships in results at line ~2776+. `effective_overall_att/se` + # may differ from the raw `overall_att/se` under the delta + # (cost-benefit) path at L_max >= 2; recomputing from + # `effective_*` ensures both paths get the final df. 
+ if np.isfinite(effective_overall_se): + effective_overall_t, effective_overall_p, effective_overall_ci = ( + safe_inference( + effective_overall_att, effective_overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + ) + # Keep `overall_*` in sync for any downstream code that + # reads them directly (e.g., placebo_event_study_dict + # construction flows from overall_*). overall_t, overall_p, overall_ci = safe_inference( overall_att, overall_se, alpha=self.alpha, df=_final_inf_df, @@ -2738,6 +2753,19 @@ def fit( _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 _info_r2["conf_int"] = _ci_r2 + # `placebo_event_study_dict` holds VALUE copies + # (not shared references) of the inner dicts, so + # mutating `placebo_horizon_inference[lag]` above + # does NOT propagate to the public surface. Update + # the negative-key mirror explicitly so the + # recomputed t/p/CI ship in results. + if ( + placebo_event_study_dict is not None + and -_lag_r2 in placebo_event_study_dict + ): + placebo_event_study_dict[-_lag_r2]["t_stat"] = _t_r2 + placebo_event_study_dict[-_lag_r2]["p_value"] = _p_r2 + placebo_event_study_dict[-_lag_r2]["conf_int"] = _ci_r2 if heterogeneity_effects: for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): if np.isfinite(_info_r2["se"]): diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 5efb8f13..7e9a4517 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -322,7 +322,7 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` - Phase 1 placebo SE is intentionally `NaN` with a warning. The dynamic companion paper Section 3.7.3 derives the cohort-recentered analytical variance for `DID_l` only — not for the placebo `DID_M^pl`. Phase 2 will add multiplier-bootstrap support for the placebo. 
Until then, the placebo point estimate is meaningful but its inference fields stay NaN-consistent **even when `n_bootstrap > 0`** (bootstrap currently covers `DID_M`, `DID_+`, and `DID_-` only) - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling -- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred +- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) ### SunAbraham diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 48c734f3..8d1643ca 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -463,7 +463,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1: - [de Chaisemartin, C. & D'Haultfœuille, X. (2020). Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects. *American Economic Review*, 110(9), 2964-2996.](https://doi.org/10.1257/aer.20181169) - [de Chaisemartin, C. & D'Haultfœuille, X. (2022, revised 2024). Difference-in-Differences Estimators of Intertemporal Treatment Effects. NBER Working Paper 29873.](https://www.nber.org/papers/w29873) — Web Appendix Section 3.7.3 contains the cohort-recentered plug-in variance formula implemented here. -**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. 
Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization; replicate weights and PSU-level bootstrap are deferred. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. +**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR) across all IF sites, plus opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0` (see the full checklist + Notes below for the contract). **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. 
**Key implementation requirements:** diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index baa36120..a5a4856f 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -918,3 +918,140 @@ def test_heterogeneity_replicate_cross_surface_df_consistency( # asserting the result attribute is populated confirms the df # flow-through was exercised. assert res.honest_did_results is not None + + def test_heterogeneity_late_nvalid_propagates_to_all_surfaces( + self, base_panel, monkeypatch, + ): + """Regression for PR #311 CI review R4 P2. + + The duplicated-column test above exercises the design-df cap + (which binds BEFORE heterogeneity runs), so it cannot catch a + bug where the post-heterogeneity recompute block fails to + propagate the final df to every public surface. This test + isolates the late-arriving case by monkeypatching + `compute_replicate_if_variance` to return a SMALLER n_valid + only on the heterogeneity call site — after main surfaces + have already frozen their t/p/CI fields with the larger df. + + Every public surface (`overall_*`, `event_study_effects`, + `placebo_event_study`, `heterogeneity_effects`, + `survey_metadata.df_survey`) must reflect the reduced df. + """ + from scipy import stats as _stats + from diff_diff import survey as _survey_mod + + R = 20 + reduced_n_valid = 7 # small enough that (reduced - 1) < R - 1 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + # Clean (full-rank) design — design df is NOT the binding cap. + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Pass 1: count the main-surface calls to + # `compute_replicate_if_variance` (L_max=1, no heterogeneity). 
+ main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Pass 2: reduce n_valid on calls STRICTLY AFTER the main + # surfaces (i.e., heterogeneity's call). + tracker = {"count": 0} + + def reduce_after_main(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, reduced_n_valid + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", + reduce_after_main, + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + + # Heterogeneity's smaller n_valid should be the binding final + # df: min(R - 1, reduced_n_valid - 1) = reduced_n_valid - 1. + expected_df = reduced_n_valid - 1 + assert res.survey_metadata.df_survey == expected_df, ( + f"survey_metadata.df_survey should be {expected_df} " + f"(reduced_n_valid - 1), got {res.survey_metadata.df_survey}" + ) + + # Overall surface: p-value must use the reduced df. + t_stat = res.overall_att / res.overall_se + expected_p_overall = 2 * _stats.t.sf(abs(t_stat), df=expected_df) + assert res.overall_p_value == pytest.approx( + expected_p_overall, rel=1e-6, + ), ( + "overall_p_value should reflect heterogeneity's late " + "n_valid via the post-recompute block" + ) + + # Event study (L=1) surface. 
+ if res.event_study_effects and 1 in res.event_study_effects: + info = res.event_study_effects[1] + if np.isfinite(info["se"]) and info["se"] > 0: + t_l = info["effect"] / info["se"] + expected_p_l = 2 * _stats.t.sf(abs(t_l), df=expected_df) + assert info["p_value"] == pytest.approx( + expected_p_l, rel=1e-6, + ) + + # Placebo event study surface (NEGATIVE keys). This is the + # one the copied-value bug would hide: `placebo_event_study` + # holds a VALUE copy of `placebo_horizon_inference`, so a + # broken recompute block would leave stale t/p/CI here. + if res.placebo_event_study: + for _lag_neg, pl_info in res.placebo_event_study.items(): + if np.isfinite(pl_info["se"]) and pl_info["se"] > 0: + t_pl = pl_info["effect"] / pl_info["se"] + expected_p_pl = 2 * _stats.t.sf( + abs(t_pl), df=expected_df, + ) + assert pl_info["p_value"] == pytest.approx( + expected_p_pl, rel=1e-6, + ), ( + f"placebo_event_study[{_lag_neg}] p-value " + f"must reflect the reduced df " + f"({expected_df}); got " + f"{pl_info['p_value']} vs expected " + f"{expected_p_pl}" + ) + + # Heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]) and het_info["se"] > 0: + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf( + abs(het_t), df=expected_df, + ) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6, + ) From abdfea4630e86fd7079870ee3602696871d2068f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:01:39 -0400 Subject: [PATCH 09/11] Fix CI review R5: propagate final df to normalized_effects + all-NaN branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R5 P0 + P2. 
P0: The R2 post-heterogeneity finalization block updated overall, joiners, leavers, multi-horizon event-study, placebo-horizon, and heterogeneity surfaces, but missed `normalized_effects_out` — which is built at line 2472-2487 from a delta-method SE (`SE(DID_l) / delta^D_l`) using the pre-heterogeneity `_df_survey`. Under a late-arriving replicate `n_valid` from heterogeneity, the public `results.normalized_effects` dict shipped stale p-values/CIs and could miss the all-NaN gate when the final replicate df turned undefined. Fix: extend the finalization block to also recompute every `normalized_effects_out[h]` entry with `_final_inf_df`. P2: Extended `test_heterogeneity_late_nvalid_propagates_to_all_surfaces` to assert `res.normalized_effects` p-values match `t(final_df)` distribution. Added a new case `test_late_nvalid_below_two_forces_all_surfaces_nan` that drives the final `n_valid` to 1 (reduced df = 0 → None via `_inference_df` coercion) and verifies EVERY public surface (overall, event_study_effects, placebo_event_study, heterogeneity_effects, normalized_effects) flips to NaN — not silently keeps the finite values computed with the larger intermediate df. Full regression: 328 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 13 +++ tests/test_survey_dcdh_replicate_psu.py | 109 +++++++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index ed6c459f..e26b612c 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2776,6 +2776,19 @@ def fit( _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 _info_r2["conf_int"] = _ci_r2 + # Normalized effects: another public surface built with the + # pre-heterogeneity `_df_survey`. Recompute inference with + # the final df so t/p/CI match the other surfaces (and the + # NaN contract when the final df becomes undefined). 
+ if normalized_effects_out is not None: + for _lag_r2, _info_r2 in list(normalized_effects_out.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 # Persist the final effective df_survey into survey_metadata so # downstream consumers — HonestDiD bounds (honest_did.py:973 diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py index a5a4856f..595f80bd 100644 --- a/tests/test_survey_dcdh_replicate_psu.py +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -1055,3 +1055,112 @@ def reduce_after_main(psi, resolved_arg): assert het_info["p_value"] == pytest.approx( expected_het_p, rel=1e-6, ) + + # Normalized effects surface. This is a DERIVED dict built + # pre-heterogeneity (delta-method SE from per-horizon SE) — + # same class of "copy" bug as placebo_event_study. R4 P0 + # regression: without the recompute, normalized_effects would + # silently ship p_value/conf_int computed with the + # pre-heterogeneity df. 
+ if res.normalized_effects: + for _lag, norm_info in res.normalized_effects.items(): + if np.isfinite(norm_info["se"]) and norm_info["se"] > 0: + norm_t = norm_info["effect"] / norm_info["se"] + expected_norm_p = 2 * _stats.t.sf( + abs(norm_t), df=expected_df, + ) + assert norm_info["p_value"] == pytest.approx( + expected_norm_p, rel=1e-6, + ), ( + f"normalized_effects[{_lag}] p-value must " + f"reflect the reduced df ({expected_df}); got " + f"{norm_info['p_value']} vs expected " + f"{expected_norm_p}" + ) + + def test_late_nvalid_below_two_forces_all_surfaces_nan( + self, base_panel, monkeypatch, + ): + """Regression for R4 P2 follow-up: when heterogeneity's late + replicate failures drive the final `n_valid` to 1 (reduced df + = 0), every public inference surface must flip to NaN — not + silently keep the finite values that were computed with a + larger intermediate df. + + `_inference_df` coerces df=0 under replicate → safe_inference + NaN branch. Exercises the all-NaN contract end to end. + """ + from diff_diff import survey as _survey_mod + + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Count main-only calls. + main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Return n_valid=1 on heterogeneity's call → reduced df = 0 + # → _inference_df coerces to NaN. 
+ tracker = {"count": 0} + + def reduce_to_one(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, 1 + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", reduce_to_one + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + # Final effective df is None (reduced_df = 0 < 1 guard). + assert res.survey_metadata.df_survey is None + # Every public inference surface must be NaN. + assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + if res.event_study_effects: + for info in res.event_study_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.placebo_event_study: + for info in res.placebo_event_study.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.heterogeneity_effects: + for info in res.heterogeneity_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.normalized_effects: + for info in res.normalized_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) From 41474069e87331a1019a4177666ca4cb52b80a9f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:20:27 -0400 Subject: [PATCH 10/11] Fix CI review R6: warning gate on eligible groups + docstring updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R6 (2 × P3 cleanups). P3 #1: Warning gate was computed from raw positive-weight groups, not the post-filter eligible-group set used to build the bootstrap PSU map. 
Panels where upstream dCDH filtering drops groups that share PSUs with kept groups could emit a misleading "PSU coarser than group" warning even when the effective bootstrap is one group per PSU. Fix: count PSUs and groups from `_eligible_group_ids` (the same set feeding `group_id_to_psu_code_bootstrap`), preserving the within-group-constant-PSU invariant by taking each eligible group's first positive-weight PSU label. P3 #2: Two docstrings said the bootstrap is "clustered at the group level" only — now incomplete after the PSU-level survey path: - `diff_diff/chaisemartin_dhaultfoeuille.py` class docstring: extended to note PSU-level Hall-Mammen wild clustering under `survey_design` with coarser PSU. - `diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py` module docstring: documents the identity-map fast path (auto-inject `psu=group`), the PSU-level broadcast when PSU is strictly coarser, and points to REGISTRY.md for the full contract. Full regression: 318 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 44 ++++++++++++++----- .../chaisemartin_dhaultfoeuille_bootstrap.py | 17 ++++--- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index e26b612c..d449d59a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -307,7 +307,10 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): default; gate via ``placebo=False``) - Analytical SE via the cohort-recentered plug-in formula from Web Appendix Section 3.7.3 of the dynamic paper - - Optional multiplier bootstrap clustered at the group level + - Optional multiplier bootstrap, clustered at the group level by + default; under ``survey_design`` with a strictly-coarser PSU, the + bootstrap switches to PSU-level Hall-Mammen wild clustering (see + REGISTRY.md ChaisemartinDHaultfoeuille Note on survey + + bootstrap) - Optional TWFE decomposition 
diagnostic from Theorem 1 of AER 2020 (per-cell weights, fraction negative, ``sigma_fe``) @@ -1970,23 +1973,40 @@ def fit( # psu=), groups and PSUs coincide so # Hall-Mammen wild PSU bootstrap equals group-level # multiplier bootstrap — no need to warn. + # Count groups/PSUs on the POST-FILTER eligible set + # (`_eligible_group_ids`) — the same set the bootstrap + # map is built from below. Using raw positive-weight + # groups here would emit a misleading warning on + # panels where upstream dCDH filtering drops groups + # that happen to share PSUs with kept groups. psu_arr_warn = getattr(resolved_survey, "psu", None) if psu_arr_warn is None or _obs_survey_info is None: # No PSU info — can't compare to group count. n_psu_eff_warn, n_groups_eff_warn = -1, -1 else: - pos_mask_warn = ( - np.asarray( - _obs_survey_info["weights"], dtype=np.float64 - ) - > 0 + obs_gids_warn = np.asarray(_obs_survey_info["group_ids"]) + obs_ws_warn = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_warn = obs_ws_warn > 0 + psu_codes_warn = np.asarray(psu_arr_warn) + # Collect the PSU label for each variance-eligible + # group (within-group-constant PSU is validated + # upstream, so the first positive-weight label + # represents the whole group). 
+ eligible_psu_labels: List[Any] = [] + for gid in _eligible_group_ids: + mask_g = (obs_gids_warn == gid) & pos_mask_warn + if mask_g.any(): + eligible_psu_labels.append( + psu_codes_warn[mask_g][0] + ) + n_groups_eff_warn = len(eligible_psu_labels) + n_psu_eff_warn = ( + int(len(np.unique(np.asarray(eligible_psu_labels)))) + if eligible_psu_labels + else -1 ) - psu_eff = np.asarray(psu_arr_warn)[pos_mask_warn] - gids_eff_warn = np.asarray( - _obs_survey_info["group_ids"] - )[pos_mask_warn] - n_psu_eff_warn = int(len(np.unique(psu_eff))) - n_groups_eff_warn = int(len(np.unique(gids_eff_warn))) if 0 <= n_psu_eff_warn < n_groups_eff_warn: warnings.warn( f"Bootstrap with survey_design uses Hall-Mammen " diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index a456890e..27552296 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -4,11 +4,18 @@ The dCDH papers prescribe only the analytical cohort-recentered plug-in variance from Web Appendix Section 3.7.3 of the dynamic companion paper. -This module adds an opt-in multiplier bootstrap clustered at the group -level, matching the inference convention used by ``CallawaySantAnna``, -``ImputationDiD``, and ``TwoStageDiD``. The bootstrap is a library -extension, not a paper requirement, and is documented as such in -``REGISTRY.md``. +This module adds an opt-in multiplier bootstrap, clustered at the group +level by default (matching the inference convention used by +``CallawaySantAnna``, ``ImputationDiD``, and ``TwoStageDiD``). Under +``survey_design`` with an explicitly-coarser PSU, the bootstrap switches +to PSU-level Hall-Mammen wild clustering: each PSU draws a single +multiplier and all groups within that PSU share it +(see ``_generate_psu_or_group_weights`` and ``_map_for_target`` below, +plus the REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + +bootstrap). 
Under the default auto-inject ``psu=group`` each group is +its own PSU and the identity-map fast path reproduces the original +group-level behavior bit-for-bit. The bootstrap is a library extension, +not a paper requirement, and is documented as such in ``REGISTRY.md``. The mixin operates on **pre-computed cohort-centered influence-function values**: the main estimator class computes per-group ``U^G_g`` values From 9368ae65b89c347e469c9d711607e50acf434a5a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 11:38:33 -0400 Subject: [PATCH 11/11] Fix CI review R7: reconcile stale cluster/placebo docstrings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #311 AI review R7 (2 × P3 doc drift cleanups). R7 P3 #1: Several sites still said dCDH "always clusters at the group level" — which was true when the PR was written but is now incomplete given the PSU-level Hall-Mammen wild bootstrap path under `survey_design`. Updated to distinguish user-specified `cluster=` (still unsupported, raises NotImplementedError) from automatic PSU-level clustering (takes over under `survey_design` with strictly-coarser PSUs; identity under auto-inject `psu=group`): - `docs/methodology/REGISTRY.md:592` Note (cluster contract) — rewrote to describe both paths; dropped "Phase 1" framing. - `docs/methodology/REGISTRY.md:636` checklist — added the automatic PSU-level upgrade clause. - `diff_diff/chaisemartin_dhaultfoeuille.py:321` constructor docstring — same contract split. - `diff_diff/chaisemartin_dhaultfoeuille.py:432` / `:503` `cluster=` error messages — removed "Phase 1" phrasing, added PSU-level-under-survey_design context. - `tests/test_chaisemartin_dhaultfoeuille.py:405` regex updated to match the new error wording (no longer pins "Phase 1"). 
R7 P3 #2: `diff_diff/guides/llms-full.txt:321` said Phase 2 will add multiplier-bootstrap support for placebo and bootstrap covers `DID_M`, `DID_+`, `DID_-` only — both stale after this PR's L_max >= 1 placebo and event-study bootstrap paths. Rewrote to scope the NaN-SE contract to `L_max=None` only and describe the full bootstrap coverage (overall, joiners, leavers, per-horizon event-study, placebo horizons, shared weights for sup-t bands). Full regression: 336 passing. --- diff_diff/chaisemartin_dhaultfoeuille.py | 45 +++++++++++++---------- diff_diff/guides/llms-full.txt | 2 +- docs/methodology/REGISTRY.md | 4 +- tests/test_chaisemartin_dhaultfoeuille.py | 29 ++++++++------- 4 files changed, 45 insertions(+), 35 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index d449d59a..d204645c 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -319,14 +319,17 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): alpha : float, default=0.05 Significance level for confidence intervals. cluster : str, optional, default=None - **Phase 1 contract:** ``cluster`` must be ``None`` (the default). - dCDH always clusters at the group level via the cohort-recentered - influence-function plug-in (analytical SEs) and the multiplier - bootstrap (also grouped at the ``group`` column). Passing any - non-``None`` value raises ``NotImplementedError`` with a Phase 1 - pointer. Custom clustering at a coarser or finer level than the - group is reserved for a future phase. See REGISTRY.md - ``ChaisemartinDHaultfoeuille`` section for the full contract. + **Cluster contract:** ``cluster`` must be ``None`` (the default). + dCDH clusters at the group level by default via the cohort- + recentered influence-function plug-in (analytical SEs) and the + multiplier bootstrap. 
Passing any non-``None`` value raises + ``NotImplementedError`` — user-specified custom clustering is + reserved for a future phase. When ``survey_design`` is provided + with strictly-coarser PSUs, the multiplier bootstrap + automatically upgrades to PSU-level Hall-Mammen wild clustering + (see REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + + bootstrap). Under the default auto-inject ``psu=group``, the + group-level and PSU-level paths are bit-identical. n_bootstrap : int, default=0 Number of multiplier-bootstrap iterations. ``0`` (default) uses only the analytical SE. Set to ``999`` or higher for stable @@ -431,12 +434,14 @@ def __init__( raise ValueError(f"n_bootstrap must be non-negative, got {n_bootstrap}") if cluster is not None: raise NotImplementedError( - f"cluster={cluster!r}: custom clustering is not supported in " - f"Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level via the cohort-recentered influence-" - f"function plug-in (analytical SEs) and the multiplier " - f"bootstrap (also grouped at the group column). To use the " - f"supported group-level clustering, pass cluster=None (the " + f"cluster={cluster!r}: user-specified clustering is not " + f"supported in ChaisemartinDHaultfoeuille. dCDH clusters at " + f"the group level by default via the cohort-recentered " + f"influence-function plug-in (analytical SEs) and the " + f"multiplier bootstrap. Under survey_design with strictly-" + f"coarser PSUs, bootstrap clustering automatically upgrades " + f"to PSU-level Hall-Mammen wild. To use the default path, " + f"pass cluster=None (the " f"default). Custom clustering is reserved for a future " f"phase. See REGISTRY.md ChaisemartinDHaultfoeuille section " f"for the full contract." 
@@ -502,11 +507,13 @@ def set_params(self, **params: Any) -> "ChaisemartinDHaultfoeuille": raise ValueError(f"n_bootstrap must be non-negative, got {self.n_bootstrap}") if self.cluster is not None: raise NotImplementedError( - f"cluster={self.cluster!r}: custom clustering is not supported " - f"in Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level. To use the supported group-level " - f"clustering, pass cluster=None (the default). Custom clustering " - f"is reserved for a future phase. See REGISTRY.md " + f"cluster={self.cluster!r}: user-specified clustering is " + f"not supported in ChaisemartinDHaultfoeuille. dCDH clusters " + f"at the group level by default; under survey_design with " + f"strictly-coarser PSUs the bootstrap automatically upgrades " + f"to PSU-level Hall-Mammen wild clustering. Pass cluster=None " + f"(the default) to use this path. User-specified custom " + f"clustering is reserved for a future phase. See REGISTRY.md " f"ChaisemartinDHaultfoeuille section for the full contract." ) return self diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7e9a4517..e14c4d7f 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -320,7 +320,7 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") **Notes:** - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` -- Phase 1 placebo SE is intentionally `NaN` with a warning. The dynamic companion paper Section 3.7.3 derives the cohort-recentered analytical variance for `DID_l` only — not for the placebo `DID_M^pl`. Phase 2 will add multiplier-bootstrap support for the placebo. Until then, the placebo point estimate is meaningful but its inference fields stay NaN-consistent **even when `n_bootstrap > 0`** (bootstrap currently covers `DID_M`, `DID_+`, and `DID_-` only) +- Single-period placebo `DID_M^pl` (i.e. 
`L_max=None`) SE is intentionally `NaN` with a warning — the per-period placebo aggregation has no influence-function derivation. Multi-horizon placebos (`L_max >= 1`) have analytical SE (cohort-recentered IF, library extension beyond the paper's Theorem 1) and bootstrap SE via the shared multiplier-bootstrap path. Bootstrap coverage at `L_max >= 1` spans `DID_M`, `DID_+`, `DID_-`, per-horizon event-study effects (`event_study_effects[l]`), and placebo horizons (`placebo_event_study[-l]`), with shared weights across horizons for valid joint (sup-t) bands - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling - Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 8d1643ca..86008e37 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -589,7 +589,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** When every variance-eligible group forms its own `(D_{g,1}, F_g, S_g)` cohort (a degenerate small-panel case where the cohort framework has zero degrees of freedom), the cohort-recentered plug-in formula is unidentified: cohort recentering subtracts the cohort mean from each group's `U^G_g`, and for singleton cohorts the centered value is exactly zero, so the centered influence function vector collapses to all zeros. The estimator returns `overall_se = NaN` with a `UserWarning` rather than silently collapsing to `0.0` (which would falsely imply infinite precision). The `DID_M` point estimate remains well-defined. 
The bootstrap path inherits the same degeneracy on these panels — the multiplier weights act on an all-zero vector, so the bootstrap distribution is also degenerate. **Deviation from R `DIDmultiplegtDYN`:** R returns a non-zero SE on the canonical 4-group worked example via small-sample sandwich machinery that Python does not implement. Both responses are valid for a degenerate case; Python's `NaN`+warning is the safer default. To get a non-degenerate SE, include more groups so cohorts have peers (real-world panels typically have `G >> K`). -- **Note (Phase 1 cluster contract):** `ChaisemartinDHaultfoeuille` always clusters at the group level. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group; both inference paths cluster at the user's `group` column with no other option. The constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` with a Phase 1 pointer at construction time (and the same gate fires from `set_params`). Custom clustering at a coarser or finer level than the group is reserved for a future phase. The matching test is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. +- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. 
**Automatic PSU-level clustering under `survey_design`:** when `survey_design` carries PSUs strictly coarser than the group and `n_bootstrap > 0`, the multiplier bootstrap automatically switches to Hall-Mammen wild PSU clustering (see the Note on survey + bootstrap below). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and PSU-level paths are identical (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. - **Note (bootstrap inference surface):** When `n_bootstrap > 0`, the top-level `results.overall_p_value` / `results.overall_conf_int` (and joiners/leavers analogues) hold **percentile-based bootstrap inference** computed by the multiplier bootstrap, NOT normal-theory recomputations from the bootstrap SE. The t-stat (`overall_t_stat`, etc.) is computed from the SE via `safe_inference()[0]` to satisfy the project's anti-pattern rule (never compute `t = effect / se` inline) — bootstrap does not define an alternative t-stat semantic for percentile bootstrap, so the SE-based t-stat is the natural choice. `event_study_effects[1]`, `summary()`, `to_dataframe()`, `is_significant`, and `significance_stars` all read from these top-level fields and therefore reflect the bootstrap inference automatically. The library precedent for this propagation is `imputation.py:790-805`, `two_stage.py:778-787`, and `efficient_did.py:1009-1013`. The single-period placebo (`L_max=None`) still has NaN bootstrap fields; multi-horizon placebos (`L_max >= 1`) have valid bootstrap SE/CI/p via `placebo_horizon_ses/cis/p_values` on the bootstrap results object. The matching test is `test_bootstrap_p_value_and_ci_propagated_to_top_level` in `tests/test_chaisemartin_dhaultfoeuille.py::TestBootstrap`. 
@@ -633,7 +633,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Single-lag placebo `DID_M^pl` (point estimate; SE deferred to Phase 2) - [x] TWFE decomposition diagnostic (Theorem 1 of AER 2020): per-cell weights, fraction negative, `sigma_fe`, `beta_fe` - [x] Standalone `twowayfeweights()` helper for users who only want the TWFE diagnostic -- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group +- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group by default; automatically upgraded to PSU-level Hall-Mammen wild clustering under `survey_design` with strictly-coarser PSUs - [x] `drop_larger_lower=True` default (matches R `DIDmultiplegtDYN`); `False` opt-in with explicit inconsistency warning - [x] Singleton-baseline filter (footnote 15 of dynamic paper, variance computation only) with explicit warning - [x] Never-switching groups participate in the variance via stable-control roles after the Round 2 full-IF fix; `n_groups_dropped_never_switching` field retained as backwards-compatibility metadata only diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 1193ce60..081d9e92 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -404,32 +404,35 @@ def test_survey_design_rejects_fweight(self, data): def test_cluster_parameter_raises_not_implemented(self, data): """ - Per Phase 1 cluster contract: dCDH always clusters at the - group level via the cohort-recentered influence function - (analytical SEs) and the multiplier bootstrap (also grouped at - the group column). Custom clustering is not supported in - Phase 1. + Per the dCDH cluster contract: dCDH clusters at the group + level by default via the cohort-recentered influence function + (analytical SEs) and the multiplier bootstrap. 
Under + ``survey_design`` with strictly-coarser PSUs the bootstrap + automatically upgrades to PSU-level Hall-Mammen wild. User- + specified clustering via the ``cluster=`` kwarg is not + supported. The reviewer flagged that ``cluster`` was previously accepted on ``__init__`` and stored on ``self.cluster`` but never actually read by ``fit()`` or ``_compute_dcdh_bootstrap()``, - making it a silent no-op. This test pins the new contract: any + making it a silent no-op. This test pins the contract: any non-None cluster value raises ``NotImplementedError`` at construction time with a message naming the offending value - and pointing at the Phase 1 reservation. The same gate fires - from ``set_params``. + and pointing at the no-custom-clustering reservation. The + same gate fires from ``set_params``. - See REGISTRY.md ``Note (Phase 1 cluster contract)``. + See REGISTRY.md ``Note (cluster contract)``. """ + pattern = r"cluster.*(not supported|reserved for a future)" # __init__ rejects any non-None cluster - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="state") - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="unit") # set_params after construction also rejects est = ChaisemartinDHaultfoeuille() - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): est.set_params(cluster="state") # cluster=None still works (the only supported value) @@ -439,7 +442,7 @@ def test_cluster_parameter_raises_not_implemented(self, data): # The convenience function also rejects (forward-compat gate # propagates through the wrapper at __init__ time) - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): 
chaisemartin_dhaultfoeuille( data, outcome="outcome",