diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 07f6c6f3..cf89c6c3 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -309,7 +309,10 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): (computed automatically by default; gate via ``placebo=False``) - Analytical SE via the cohort-recentered plug-in formula from Web Appendix Section 3.7.3; multiplier bootstrap clustered at the group - level via ``n_bootstrap`` + level by default via ``n_bootstrap``; under ``survey_design`` with + strictly-coarser PSUs the bootstrap automatically upgrades to + PSU-level Hall-Mammen wild clustering (see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey + bootstrap) - Normalized estimator ``DID^n_l``, cost-benefit aggregate ``delta``, and sup-t simultaneous confidence bands - Residualization-style covariate adjustment (``DID^X``) via @@ -317,8 +320,9 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): ``trends_linear=True``, state-set-specific trends via ``trends_nonparam=``, heterogeneity testing, non-binary treatment, HonestDiD sensitivity integration on placebos via ``honest_did=True`` - - Survey support via ``survey_design=`` (pweight + strata/PSU/FPC with - Taylor-series linearization) + - Survey support via ``survey_design=``: pweight with strata/PSU/FPC + via Taylor Series Linearization (analytical) or replicate-weight + variance (BRR/Fay/JK1/JKn/SDR) - TWFE decomposition diagnostic from Theorem 1 of AER 2020 Only ``aggregate`` on :meth:`fit` still raises ``NotImplementedError``. @@ -328,13 +332,27 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): alpha : float, default=0.05 Significance level for confidence intervals. cluster : str, optional, default=None - Must be ``None`` (the default). 
dCDH always clusters at the group - level via the cohort-recentered influence-function plug-in - (analytical SEs) and the multiplier bootstrap (also grouped at the - ``group`` column). Passing any non-``None`` value raises - ``NotImplementedError``. Custom clustering at a coarser or finer - level than the group is a planned extension. See REGISTRY.md - ``ChaisemartinDHaultfoeuille`` section for the full contract. + Must be ``None`` (the default). User-specified clustering via + this kwarg is not supported — passing any non-``None`` value + raises ``NotImplementedError`` at construction time (and the + same gate fires from ``set_params``). The effective clustering + depends on how you call ``fit()``: + + - **Default (no survey_design)**: clustered at the group level + via the cohort-recentered influence-function plug-in + (analytical SEs) and the multiplier bootstrap. + - **Under ``survey_design`` with auto-inject or explicit + ``psu=group``**: PSU coincides with the group and the + group-level and PSU-level paths are bit-identical. + - **Under ``survey_design`` with strictly-coarser PSUs**: the + multiplier bootstrap automatically upgrades to PSU-level + Hall-Mammen wild clustering. + + So dCDH does NOT always cluster at the group level — see + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes on cluster + contract and survey + bootstrap for the full matrix. Custom + user-specified clustering at a coarser or finer level than the + group is a planned extension. n_bootstrap : int, default=0 Number of multiplier-bootstrap iterations. ``0`` (default) uses only the analytical SE. Set to ``999`` or higher for stable @@ -439,12 +457,14 @@ def __init__( raise ValueError(f"n_bootstrap must be non-negative, got {n_bootstrap}") if cluster is not None: raise NotImplementedError( - f"cluster={cluster!r}: custom clustering is not supported in " - f"Phase 1 of ChaisemartinDHaultfoeuille. 
dCDH always clusters " - f"at the group level via the cohort-recentered influence-" - f"function plug-in (analytical SEs) and the multiplier " - f"bootstrap (also grouped at the group column). To use the " - f"supported group-level clustering, pass cluster=None (the " + f"cluster={cluster!r}: user-specified clustering is not " + f"supported in ChaisemartinDHaultfoeuille. dCDH clusters at " + f"the group level by default via the cohort-recentered " + f"influence-function plug-in (analytical SEs) and the " + f"multiplier bootstrap. Under survey_design with strictly-" + f"coarser PSUs, bootstrap clustering automatically upgrades " + f"to PSU-level Hall-Mammen wild. To use the default path, " + f"pass cluster=None (the " f"default). Custom clustering is reserved for a future " f"phase. See REGISTRY.md ChaisemartinDHaultfoeuille section " f"for the full contract." @@ -510,11 +530,13 @@ def set_params(self, **params: Any) -> "ChaisemartinDHaultfoeuille": raise ValueError(f"n_bootstrap must be non-negative, got {self.n_bootstrap}") if self.cluster is not None: raise NotImplementedError( - f"cluster={self.cluster!r}: custom clustering is not supported " - f"in Phase 1 of ChaisemartinDHaultfoeuille. dCDH always clusters " - f"at the group level. To use the supported group-level " - f"clustering, pass cluster=None (the default). Custom clustering " - f"is reserved for a future phase. See REGISTRY.md " + f"cluster={self.cluster!r}: user-specified clustering is " + f"not supported in ChaisemartinDHaultfoeuille. dCDH clusters " + f"at the group level by default; under survey_design with " + f"strictly-coarser PSUs the bootstrap automatically upgrades " + f"to PSU-level Hall-Mammen wild clustering. Pass cluster=None " + f"(the default) to use this path. User-specified custom " + f"clustering is reserved for a future phase. See REGISTRY.md " f"ChaisemartinDHaultfoeuille section for the full contract." 
) return self @@ -615,11 +637,25 @@ def fit( ``drop_larger_lower=False`` to retain 2-switch groups. survey_design : SurveyDesign, optional Survey design specification for design-based inference. - Supports pweight with strata/PSU/FPC via Taylor Series - Linearization. Survey weights produce weighted cell means - for the point estimate; the IF-based variance accounts for - the survey design structure. Replicate weights are not yet - supported. + Supports ``weight_type='pweight'`` with two variance paths: + (1) Taylor Series Linearization using strata / PSU / FPC + (analytical) and (2) replicate-weight variance using + BRR / Fay / JK1 / JKn / SDR methods (analytical, + closed-form). Survey weights produce weighted cell means + for the point estimate. Under a survey design without an + explicit ``psu``, ``fit()`` auto-injects ``psu=`` + so the group is the effective sampling unit (strata / PSU + must be within-group-constant; mixed labels within a group + raise ``ValueError``). When ``n_bootstrap > 0`` and a + survey design is supplied, the multiplier bootstrap + operates at the PSU level (Hall-Mammen wild PSU + bootstrap) — under the default auto-inject this collapses + to a group-level clustered bootstrap. **Replicate weights + combined with ``n_bootstrap > 0`` are rejected** with + ``NotImplementedError`` (replicate variance is + closed-form; bootstrap would double-count variance). See + REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes for the + full contract. Returns ------- @@ -725,13 +761,15 @@ def fit( f"weight_type='pweight', got '{resolved_survey.weight_type}'. " f"The survey IF variance math assumes probability weights." ) - if (resolved_survey.replicate_weights is not None - and resolved_survey.replicate_weights.shape[1] > 0): - raise NotImplementedError( - "Replicate weight variance for dCDH is not yet supported. " - "Use strata/PSU/FPC for design-based inference via Taylor " - "Series Linearization." 
- ) + # Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are supported + # for analytical variance via compute_replicate_if_variance() + # at each IF site (see _survey_se_from_group_if). The combination + # of replicate weights and n_bootstrap > 0 is rejected inside + # the bootstrap entry block (replicate variance is closed-form; + # bootstrap would double-count variance). Matches library + # precedent: efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + # Within-group-constant PSU/strata is required: the IF # expansion psi_i = U[g] * (w_i / W_g) supports within-group # variation in WEIGHTS (each obs contributes proportionally), @@ -778,6 +816,16 @@ def fit( "resolved": resolved_survey, } + # Replicate-weight variance tracker: each _compute_se call under + # a replicate design returns n_valid (number of replicate columns + # that produced a finite estimate). The effective df_survey is + # min(n_valid) - 1 across all IF sites — matches the precedent in + # `diff_diff/efficient_did.py:1133-1135` and + # `diff_diff/triple_diff.py:676-686`. Under TSL (analytical), + # _compute_se returns None for n_valid and df_survey falls through + # to resolved_survey.df_survey (= n_psu - n_strata). 
+ _replicate_n_valid_list: List[int] = [] + # ------------------------------------------------------------------ # Step 4b: Covariate aggregation (DID^X, Web Appendix Section 1.2) # ------------------------------------------------------------------ @@ -1600,17 +1648,19 @@ def fit( U_centered_l = _cohort_recenter(U_l_elig, cid_elig) N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] - se_l = _compute_se( + se_l, n_valid_l = _compute_se( U_centered=U_centered_l, divisor=N_l_h, obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_l, ) + if n_valid_l is not None: + _replicate_n_valid_list.append(n_valid_l) multi_horizon_se[l_h] = se_l did_l_val = multi_horizon_dids[l_h]["did_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_df_s) + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) + t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey)) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1727,17 +1777,20 @@ def fit( cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] - se_pl_l = _compute_se( + se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_pl, ) + if n_valid_pl_l is not None: + _replicate_n_valid_list.append(n_valid_pl_l) placebo_horizon_se[lag_l] = se_pl_l pl_val = pl_data["placebo_l"] - _df_s = resolved_survey.df_survey if resolved_survey is not None else None + _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, df=_df_s + pl_val, se_pl_l, 
alpha=self.alpha, + df=_inference_df(_df_s, resolved_survey), ) placebo_horizon_inference[lag_l] = { "effect": pl_val, @@ -1810,12 +1863,14 @@ def fit( ) # Analytical SE for DID_M (survey-aware when survey_design provided) - overall_se = _compute_se( + overall_se, n_valid_overall = _compute_se( U_centered=U_centered_overall, divisor=N_S, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_overall is not None: + _replicate_n_valid_list.append(n_valid_overall) # Detect the degenerate-cohort case: every variance-eligible group # forms its own (D_{g,1}, F_g, S_g) cohort, so the centered # influence function is identically zero and `_plugin_se` returns @@ -1840,23 +1895,26 @@ def fit( UserWarning, stacklevel=2, ) - _df_survey = ( - resolved_survey.df_survey if resolved_survey is not None else None - ) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, df=_df_survey + overall_att, overall_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) # Joiners SE (uses joiner-only centered IF; conservative bound) if joiners_available: - joiners_se = _compute_se( + joiners_se, n_valid_joiners = _compute_se( U_centered=U_centered_joiners, divisor=joiner_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_joiners is not None: + _replicate_n_valid_list.append(n_valid_joiners) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, df=_df_survey + joiners_att, joiners_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: joiners_se, joiners_t, joiners_p, joiners_ci = ( @@ -1868,14 +1926,18 @@ def fit( # Leavers SE if leavers_available: - leavers_se = _compute_se( + leavers_se, n_valid_leavers = _compute_se( 
U_centered=U_centered_leavers, divisor=leaver_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, ) + if n_valid_leavers is not None: + _replicate_n_valid_list.append(n_valid_leavers) + _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, df=_df_survey + leavers_att, leavers_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: leavers_se, leavers_t, leavers_p, leavers_ci = ( @@ -1916,14 +1978,81 @@ def fit( bootstrap_results: Optional[DCDHBootstrapResults] = None if self.n_bootstrap > 0: if resolved_survey is not None: - warnings.warn( - "Bootstrap with survey_design uses group-level multiplier " - "weights, not PSU-level. This is conservative when groups " - "are finer than PSUs. PSU-level survey bootstrap is " - "deferred to a future release.", - UserWarning, - stacklevel=2, - ) + # Replicate-weight designs have their own closed-form + # variance (Rao-Wu / BRR / Fay / JK1 / JKn / SDR via + # compute_replicate_if_variance). Combining replicate + # variance with a multiplier bootstrap would double-count + # variance. Match library precedent: + # efficient_did.py:989, staggered.py:1869, + # two_stage.py:251-253. + if ( + resolved_survey.replicate_weights is not None + and resolved_survey.replicate_weights.shape[1] > 0 + ): + raise NotImplementedError( + "dCDH survey support rejects the combination of " + "replicate weights and n_bootstrap > 0. Replicate-" + "weight variance is closed-form " + "(compute_replicate_if_variance); use n_bootstrap=0. " + "For a bootstrap-based SE, use strata/PSU/FPC design " + "instead of replicate weights." + ) + + # Warning fires only when PSU is strictly coarser than + # group. Under auto-inject psu=group (or explicit + # psu=), groups and PSUs coincide so + # Hall-Mammen wild PSU bootstrap equals group-level + # multiplier bootstrap — no need to warn. 
+ # Count groups/PSUs on the POST-FILTER eligible set + # (`_eligible_group_ids`) — the same set the bootstrap + # map is built from below. Using raw positive-weight + # groups here would emit a misleading warning on + # panels where upstream dCDH filtering drops groups + # that happen to share PSUs with kept groups. + psu_arr_warn = getattr(resolved_survey, "psu", None) + if psu_arr_warn is None or _obs_survey_info is None: + # No PSU info — can't compare to group count. + n_psu_eff_warn, n_groups_eff_warn = -1, -1 + else: + obs_gids_warn = np.asarray(_obs_survey_info["group_ids"]) + obs_ws_warn = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_warn = obs_ws_warn > 0 + psu_codes_warn = np.asarray(psu_arr_warn) + # Collect the PSU label for each variance-eligible + # group (within-group-constant PSU is validated + # upstream, so the first positive-weight label + # represents the whole group). + eligible_psu_labels: List[Any] = [] + for gid in _eligible_group_ids: + mask_g = (obs_gids_warn == gid) & pos_mask_warn + if mask_g.any(): + eligible_psu_labels.append( + psu_codes_warn[mask_g][0] + ) + n_groups_eff_warn = len(eligible_psu_labels) + n_psu_eff_warn = ( + int(len(np.unique(np.asarray(eligible_psu_labels)))) + if eligible_psu_labels + else -1 + ) + if 0 <= n_psu_eff_warn < n_groups_eff_warn: + warnings.warn( + f"Bootstrap with survey_design uses Hall-Mammen " + f"wild multiplier weights at the PSU level " + f"(n_psu={n_psu_eff_warn} PSUs across " + f"n_groups={n_groups_eff_warn} groups). For " + f"designs with substantially unequal PSU sizes, " + f"the wild bootstrap may under-cover relative to " + f"analytical TSL inference; consider " + f"n_bootstrap=0 for the TSL variance. 
If n_psu " + f"is close to n_groups, lonely-PSU removal " + f"(lonely_psu='remove') may be collapsing " + f"singletons.", + UserWarning, + stacklevel=2, + ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att) if joiners_available else None ) @@ -2039,6 +2168,52 @@ def fit( h_data["did_l"], ) + # Under a survey design with PSU information, build a + # `group_id_to_psu_code` dict so the bootstrap mixin can + # derive each target's PSU map by ID lookup rather than by + # positional reuse. PSU/strata are within-group-constant by + # the _validate_group_constant_strata_psu precondition, so + # each variance-eligible group has exactly one PSU label. + # Under auto-inject psu=group each group has a unique PSU + # code and the bootstrap mixin's identity-map fast path + # reproduces the pre-PSU behavior bit-for-bit. + group_id_to_psu_code_bootstrap: Optional[Dict[Any, int]] = None + eligible_group_ids_bootstrap: Optional[np.ndarray] = None + if ( + resolved_survey is not None + and getattr(resolved_survey, "psu", None) is not None + and _obs_survey_info is not None + ): + obs_psu_codes = np.asarray(resolved_survey.psu) + obs_gids_boot = np.asarray(_obs_survey_info["group_ids"]) + obs_weights_boot = np.asarray( + _obs_survey_info["weights"], dtype=np.float64 + ) + pos_mask_boot = obs_weights_boot > 0 + group_psu_labels: List[Any] = [] + valid_map = True + for gid in _eligible_group_ids: + mask_g = (obs_gids_boot == gid) & pos_mask_boot + if not mask_g.any(): + valid_map = False + break + labels = obs_psu_codes[mask_g] + # Within-group-constant PSU is validated upstream; + # the first label represents the whole group. + group_psu_labels.append(labels[0]) + if valid_map and len(group_psu_labels) == n_groups_for_overall_var: + # Factor PSU labels to dense integer codes. 
+ _, dense_codes = np.unique( + np.asarray(group_psu_labels), + return_inverse=True, + ) + dense_codes = np.asarray(dense_codes, dtype=np.int64) + group_id_to_psu_code_bootstrap = { + gid: int(code) + for gid, code in zip(_eligible_group_ids, dense_codes) + } + eligible_group_ids_bootstrap = np.asarray(_eligible_group_ids) + br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, u_centered_overall=U_centered_overall, @@ -2049,6 +2224,8 @@ def fit( placebo_inputs=placebo_inputs, multi_horizon_inputs=mh_boot_inputs, placebo_horizon_inputs=pl_boot_inputs, + group_id_to_psu_code=group_id_to_psu_code_bootstrap, + eligible_group_ids=eligible_group_ids_bootstrap, ) bootstrap_results = br @@ -2077,17 +2254,17 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_df_survey)[0] + overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_df_survey)[0] + joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_df_survey)[0] + leavers_t = 
safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2249,7 +2426,8 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, df=_df_survey + delta_val, delta_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) else: effective_overall_se = float("nan") @@ -2283,7 +2461,8 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, df=_df_survey + pl_data["placebo_l"], pl_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), ) placebo_event_study_dict[-lag_l] = { "effect": pl_data["placebo_l"], @@ -2334,7 +2513,8 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, df=_df_survey + eff, bs_se, alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), )[0] # Phase 2: build normalized_effects with SE @@ -2347,7 +2527,7 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_df_survey) + t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey)) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2406,7 +2586,8 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, df=_df_survey + cum_effect, running_se_ub, alpha=self.alpha, + df=_inference_df(_df_survey, 
resolved_survey), ) cumulated[l_h] = { "effect": cum_effect, @@ -2507,6 +2688,7 @@ def fit( rank_deficient_action=self.rank_deficient_action, group_ids_order=np.array(all_groups), obs_survey_info=_obs_survey_info, + replicate_n_valid_list=_replicate_n_valid_list, ) twfe_weights_df = None @@ -2559,6 +2741,115 @@ def fit( effective_joiners_available = False effective_leavers_available = False + # R2 P1b: Finalize replicate-df propagation across public + # surfaces. When heterogeneity is active, its n_valid is + # appended to `_replicate_n_valid_list` AFTER the main + # surfaces (overall / joiners / leavers / event study / + # placebo horizon) have already computed their t/p/CI fields + # with an intermediate `_df_survey`. If heterogeneity reports + # the smallest n_valid, the main-surface inference would be + # anti-conservative relative to `survey_metadata.df_survey` + # and HonestDiD. Re-run safe_inference with the FINAL + # effective df so every surface agrees. + _final_eff_df = _effective_df_survey( + resolved_survey, _replicate_n_valid_list + ) + if _replicate_n_valid_list: + _final_inf_df = _inference_df(_final_eff_df, resolved_survey) + # Recompute `effective_overall_*` directly — that's what + # ships in results at line ~2776+. `effective_overall_att/se` + # may differ from the raw `overall_att/se` under the delta + # (cost-benefit) path at L_max >= 2; recomputing from + # `effective_*` ensures both paths get the final df. + if np.isfinite(effective_overall_se): + effective_overall_t, effective_overall_p, effective_overall_ci = ( + safe_inference( + effective_overall_att, effective_overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + ) + # Keep `overall_*` in sync for any downstream code that + # reads them directly (e.g., placebo_event_study_dict + # construction flows from overall_*). 
+ overall_t, overall_p, overall_ci = safe_inference( + overall_att, overall_se, + alpha=self.alpha, df=_final_inf_df, + ) + if joiners_available: + joiners_t, joiners_p, joiners_ci = safe_inference( + joiners_att, joiners_se, + alpha=self.alpha, df=_final_inf_df, + ) + if leavers_available: + leavers_t, leavers_p, leavers_ci = safe_inference( + leavers_att, leavers_se, + alpha=self.alpha, df=_final_inf_df, + ) + if multi_horizon_inference is not None: + for _lag_r2, _info_r2 in list(multi_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + if placebo_horizon_inference is not None: + for _lag_r2, _info_r2 in list(placebo_horizon_inference.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + # `placebo_event_study_dict` holds VALUE copies + # (not shared references) of the inner dicts, so + # mutating `placebo_horizon_inference[lag]` above + # does NOT propagate to the public surface. Update + # the negative-key mirror explicitly so the + # recomputed t/p/CI ship in results. 
+ if ( + placebo_event_study_dict is not None + and -_lag_r2 in placebo_event_study_dict + ): + placebo_event_study_dict[-_lag_r2]["t_stat"] = _t_r2 + placebo_event_study_dict[-_lag_r2]["p_value"] = _p_r2 + placebo_event_study_dict[-_lag_r2]["conf_int"] = _ci_r2 + if heterogeneity_effects: + for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): + if np.isfinite(_info_r2["se"]): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["beta"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + # Normalized effects: another public surface built with the + # pre-heterogeneity `_df_survey`. Recompute inference with + # the final df so t/p/CI match the other surfaces (and the + # NaN contract when the final df becomes undefined). + if normalized_effects_out is not None: + for _lag_r2, _info_r2 in list(normalized_effects_out.items()): + _t_r2, _p_r2, _ci_r2 = safe_inference( + _info_r2["effect"], _info_r2["se"], + alpha=self.alpha, df=_final_inf_df, + ) + _info_r2["t_stat"] = _t_r2 + _info_r2["p_value"] = _p_r2 + _info_r2["conf_int"] = _ci_r2 + + # Persist the final effective df_survey into survey_metadata so + # downstream consumers — HonestDiD bounds (honest_did.py:973 + # reads results.survey_metadata.df_survey), exported metadata, + # and users — all see the same df that the recomputed + # inference above used. SurveyMetadata is a mutable @dataclass + # (diff_diff/survey.py:681), so direct attribute assignment is + # safe. 
+ if survey_metadata is not None: + survey_metadata.df_survey = _final_eff_df + results = ChaisemartinDHaultfoeuilleResults( overall_att=effective_overall_att, overall_se=effective_overall_se, @@ -3289,6 +3580,7 @@ def _compute_heterogeneity_test( rank_deficient_action: str = "warn", group_ids_order: Optional[np.ndarray] = None, obs_survey_info: Optional[Dict[str, Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Dict[int, Dict[str, Any]]: """Test for heterogeneous treatment effects (Web Appendix Section 1.5). @@ -3314,8 +3606,21 @@ def _compute_heterogeneity_test( Observation-level survey info with keys ``group_ids`` (raw per-row group labels), ``weights`` (per-row survey weights), and ``resolved`` (ResolvedSurveyDesign). When provided, the regression uses WLS with - per-group weights W_g = sum of obs survey weights, and SE is computed - via Binder TSL IF expansion through ``compute_survey_if_variance``. + per-group weights W_g = sum of obs survey weights. SE is computed + via Binder TSL IF expansion through ``compute_survey_if_variance`` + by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), + dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style + variance. The effective df for t-critical values follows the + site-level ``min(df_s, n_valid_het - 1)`` rule and the helper + mutates ``replicate_n_valid_list`` so the final + ``_effective_df_survey(...)`` sees this site's n_valid. + replicate_n_valid_list : list[int], optional + Shared accumulator for replicate-weight ``n_valid`` counts across + IF sites. When provided and a replicate design is in use, this + function appends its own ``n_valid_het`` before computing the + local effective df — so both the local inference fields and the + final ``survey_metadata.df_survey`` (set by ``fit()``) reflect + this site's contribution. 
Returns ------- @@ -3334,13 +3639,25 @@ def _compute_heterogeneity_test( obs_survey_info is not None and group_ids_order is not None ) if use_survey: - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) obs_gids_raw = np.asarray(obs_survey_info["group_ids"]) obs_w_raw = np.asarray(obs_survey_info["weights"], dtype=np.float64) resolved = obs_survey_info["resolved"] - df_s = ( - resolved.df_survey if resolved is not None else None + # df_s starts from the shared effective df (base-capped by + # design df). Under replicate designs the helper handles the + # min(resolved.df_survey, min(n_valid) - 1) reduction and + # preserves None when the base df is undefined (QR-rank ≤ 1). + # Using the helper here — rather than re-deriving locally — + # keeps heterogeneity's df consistent with the main dCDH + # surfaces (R2 P1a). `list(... or [])` avoids accidental + # mutation of the caller's shared tracker at this site; the + # explicit append happens inside the horizon loop below. + df_s = _effective_df_survey( + resolved, list(replicate_n_valid_list or []) ) # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, @@ -3495,15 +3812,42 @@ def _compute_heterogeneity_test( obs_w_raw[mask_g] / w_sum_g ) - # Binder TSL variance across stratified PSUs. - var_s = compute_survey_if_variance(psi_obs, resolved) + # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR) + # vs Binder TSL across stratified PSUs. Mirrors the inline + # branch in _survey_se_from_group_if and the pattern in + # TripleDifference:1206-1238. Heterogeneity uses WLS with + # full-sample weights; theta_hat is treated as fixed per the + # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity + # under replicate — no per-replicate refits). 
+ if getattr(resolved, "uses_replicate_variance", False): + var_s, n_valid_het = compute_replicate_if_variance( + psi_obs, resolved + ) + if replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_het) + # Reduce df_s to reflect this horizon's n_valid. + # R2 P1a: when the shared base-capped df_s is None + # (undefined base df, e.g., QR-rank ≤ 1), the + # heterogeneity df MUST stay None — per-site n_valid + # cannot rescue a rank-deficient design. The + # _inference_df wrapper at the safe_inference call + # below coerces None to 0 under replicate, forcing + # NaN inference. + if df_s is None: + df_s_local = None + else: + df_s_local = min(int(df_s), int(n_valid_het) - 1) + else: + var_s = compute_survey_if_variance(psi_obs, resolved) + df_s_local = df_s se_het = ( float(np.sqrt(var_s)) if np.isfinite(var_s) and var_s > 0 else float("nan") ) t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, df=df_s + beta_het, se_het, alpha=alpha, + df=_inference_df(df_s_local, resolved), ) results[l_h] = { @@ -4823,24 +5167,93 @@ def _validate_group_constant_strata_psu( ) +def _inference_df( + effective_df: Optional[int], + resolved_survey: Any, +) -> Optional[int]: + """Coerce an effective-df value for use in ``safe_inference(df=...)``. + + ``_effective_df_survey()`` returns ``None`` when the replicate + design's base df is undefined (QR-rank ≤ 1) or when the reduced + df would be < 1. ``safe_inference`` treats ``df=None`` as the + standard normal (z-inference) and only returns NaN for + ``df <= 0``. Under a replicate design, "undefined effective df" + MUST map to NaN inference per REGISTRY.md — NOT to z-inference. + + Returns: + - ``effective_df`` when defined (t-inference with that df). + - ``0`` when ``effective_df is None`` AND ``resolved_survey`` uses + replicate variance — forces ``safe_inference`` to return NaN + t/p/CI via its ``df <= 0`` branch. 
+ - ``None`` otherwise (no survey, or TSL design where the resolver + always returns an int df — z-inference is never reachable here). + """ + if effective_df is not None: + return effective_df + if resolved_survey is None: + return None + if getattr(resolved_survey, "uses_replicate_variance", False): + return 0 + return None + + +def _effective_df_survey( + resolved_survey: Any, + replicate_n_valid_list: List[int], +) -> Optional[int]: + """Compute the effective ``df_survey`` for t-critical values. + + Under replicate-weight designs, each IF site returns an ``n_valid`` + count (number of replicate columns that produced a finite estimate). + This helper **reduces** — never overwrites — the design-level + ``resolved_survey.df_survey`` (which for replicate designs is + ``QR-rank - 1`` per R's ``survey::degf()`` convention; see + ``diff_diff/survey.py:590``). Returns + ``min(resolved_survey.df_survey, min(n_valid) - 1)`` when both are + defined. + + Matches the precedent in ``diff_diff/efficient_did.py:1133-1135`` + and ``diff_diff/triple_diff.py:676-686`` (both reduce, not + replace). Under TSL (analytical) or non-survey fits, + ``replicate_n_valid_list`` is empty and this falls through to the + design-level df. Returns ``None`` when either the base df is + undefined (rank ≤ 1) or the reduced df would be < 1 — preserving + the NaN-inference contract from ``safe_inference``. + """ + if resolved_survey is None: + return None + base_df = resolved_survey.df_survey + if not replicate_n_valid_list: + return base_df + reduced_df = int(min(replicate_n_valid_list)) - 1 + if base_df is None or reduced_df < 1: + return None + return min(int(base_df), reduced_df) + + def _compute_se( U_centered: np.ndarray, divisor: int, obs_survey_info: Optional[dict], eligible_groups: Optional[list] = None, -) -> float: +) -> Tuple[float, Optional[int]]: """Dispatch to plug-in SE or survey-design-aware SE. When ``obs_survey_info`` is ``None``, falls back to the simple plug-in formula. 
Otherwise, expands group-level IFs and delegates - to TSL variance. + to TSL variance (or replicate-weight variance when the resolved + design carries replicate weights). + + Returns ``(se, n_valid_replicates)``. ``n_valid_replicates`` is + ``None`` under the plug-in / TSL paths, and the number of valid + replicate columns under the replicate path. """ if obs_survey_info is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if eligible_groups is None: - return _plugin_se(U_centered=U_centered, divisor=divisor) + return _plugin_se(U_centered=U_centered, divisor=divisor), None if divisor <= 0: - return float("nan") + return float("nan"), None # dCDH IFs are numerator-scale (U.sum() == N_S * DID_M). # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. @@ -4856,16 +5269,20 @@ def _survey_se_from_group_if( U_centered: np.ndarray, eligible_groups: list, obs_survey_info: dict, -) -> float: +) -> Tuple[float, Optional[int]]: """Compute survey-design-aware SE from group-level centered IFs. Expands group-level influence function values to observation level using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` - so that ``sum(psi_i for i in g) = U[g]``, then delegates to - ``compute_survey_if_variance()`` for the TSL design-based variance. + so that ``sum(psi_i for i in g) = U[g]``, then delegates to either + ``compute_survey_if_variance()`` (TSL / analytical path) or + ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) depending + on whether the resolved design carries replicate weights. This is a library extension not in the dCDH papers (the paper's - plug-in variance assumes iid sampling). + plug-in variance assumes iid sampling). The replicate path uses + Rao-Wu weight-ratio rescaling; see REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey expansion. 
Parameters ---------- @@ -4880,10 +5297,18 @@ def _survey_se_from_group_if( Returns ------- - float - Survey-design-aware SE, or NaN if degenerate. + Tuple[float, Optional[int]] + ``(se, n_valid_replicates)``. ``se`` is the survey-design-aware + SE, or NaN if degenerate. ``n_valid_replicates`` is the number + of valid replicate columns used for variance under the replicate + path, or ``None`` under the TSL (analytical) path. Callers + track the min across sites to set an effective ``df_survey`` + for replicate designs. """ - from diff_diff.survey import compute_survey_if_variance + from diff_diff.survey import ( + compute_replicate_if_variance, + compute_survey_if_variance, + ) # Degenerate-cohort contract (mirror _plugin_se): when the centered # IF is empty or every cohort is a singleton (→ recentered IF is @@ -4893,9 +5318,9 @@ def _survey_se_from_group_if( # through _compute_se (overall, joiners/leavers, multi-horizon # ATT, placebos, normalized/cumulated, heterogeneity). if U_centered.size == 0: - return float("nan") + return float("nan"), None if float((U_centered ** 2).sum()) <= 0: - return float("nan") + return float("nan"), None group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] @@ -4913,7 +5338,7 @@ def _survey_se_from_group_if( psi = np.zeros(n_obs, dtype=np.float64) if not pos_mask.any(): - return float("nan") + return float("nan"), None gids_eff = np.asarray(group_ids)[pos_mask] w_eff = weights_arr[pos_mask] @@ -4933,10 +5358,20 @@ def _survey_se_from_group_if( safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) psi[pos_mask] = u_obs_eff * (w_eff / safe_w) + # Dispatch: replicate variance (BRR/Fay/JK1/JKn/SDR) vs TSL. + # Mirrors the inline branch in TripleDifference:1206-1238 and + # EfficientDiD:1119-1142. Under replicate, returns (variance, n_valid); + # the caller propagates min(n_valid) to df_survey. 
+ if getattr(resolved, "uses_replicate_variance", False): + variance, n_valid = compute_replicate_if_variance(psi, resolved) + if not np.isfinite(variance) or variance < 0: + return float("nan"), n_valid + return float(np.sqrt(variance)), n_valid + variance = compute_survey_if_variance(psi, resolved) if not np.isfinite(variance) or variance < 0: - return float("nan") - return float(np.sqrt(variance)) + return float("nan"), None + return float(np.sqrt(variance)), None def _build_group_time_design( @@ -5228,6 +5663,11 @@ def twowayfeweights( (matching ``fit(..., survey_design=sd).twfe_*``). Required to preserve fit-vs-helper parity under survey-backed inputs. Only ``weight_type='pweight'`` is supported; other types raise ValueError. + Replicate-weight designs (BRR/Fay/JK1/JKn/SDR) are accepted — + the TWFE diagnostic has no SE field on ``TWFEWeightsResult``, + so replicate weights only affect the cell aggregation path + (aggregated numbers are identical to + ``fit(..., survey_design=sd).twfe_*`` under the same input). Returns ------- @@ -5251,16 +5691,13 @@ def twowayfeweights( f"The TWFE diagnostic under survey uses survey-weighted cell " f"means; other weight types are not supported." ) - if ( - resolved is not None - and resolved.replicate_weights is not None - and resolved.replicate_weights.shape[1] > 0 - ): - raise NotImplementedError( - "Replicate weight variance for twowayfeweights() is not " - "supported. Use strata/PSU/FPC for design-based inference " - "via Taylor Series Linearization (matches the fit() path)." - ) + # Replicate-weight designs are accepted. TWFE diagnostic has no + # SE field on TWFEWeightsResult — the replicate weights only + # participate through the full-sample weight (resolved.weights), + # which drives survey-weighted cell aggregation in + # _validate_and_aggregate_to_cells. Diagnostic numbers + # (beta_fe, sigma_fe, fraction_negative) match + # fit(..., survey_design=sd).twfe_* under replicate input. 
# Validation + cell aggregation via the same helper used by # ChaisemartinDHaultfoeuille.fit() — enforces NaN/binary/within-cell diff --git a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py index 7eea75cf..27552296 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_bootstrap.py @@ -4,11 +4,18 @@ The dCDH papers prescribe only the analytical cohort-recentered plug-in variance from Web Appendix Section 3.7.3 of the dynamic companion paper. -This module adds an opt-in multiplier bootstrap clustered at the group -level, matching the inference convention used by ``CallawaySantAnna``, -``ImputationDiD``, and ``TwoStageDiD``. The bootstrap is a library -extension, not a paper requirement, and is documented as such in -``REGISTRY.md``. +This module adds an opt-in multiplier bootstrap, clustered at the group +level by default (matching the inference convention used by +``CallawaySantAnna``, ``ImputationDiD``, and ``TwoStageDiD``). Under +``survey_design`` with an explicitly-coarser PSU, the bootstrap switches +to PSU-level Hall-Mammen wild clustering: each PSU draws a single +multiplier and all groups within that PSU share it +(see ``_generate_psu_or_group_weights`` and ``_map_for_target`` below, +plus the REGISTRY.md ``ChaisemartinDHaultfoeuille`` Note on survey + +bootstrap). Under the default auto-inject ``psu=group`` each group is +its own PSU and the identity-map fast path reproduces the original +group-level behavior bit-for-bit. The bootstrap is a library extension, +not a paper requirement, and is documented as such in ``REGISTRY.md``. The mixin operates on **pre-computed cohort-centered influence-function values**: the main estimator class computes per-group ``U^G_g`` values @@ -19,7 +26,7 @@ produce a bootstrap distribution per target. 
""" -from typing import TYPE_CHECKING, Dict, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np @@ -70,6 +77,9 @@ def _compute_dcdh_bootstrap( # --- Phase 2: multi-horizon inputs --- multi_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, placebo_horizon_inputs: Optional[Dict[int, Tuple[np.ndarray, int, float]]] = None, + # --- Survey: PSU-level bootstrap under survey designs --- + group_id_to_psu_code: Optional[Dict[Any, int]] = None, + eligible_group_ids: Optional[np.ndarray] = None, ) -> DCDHBootstrapResults: """ Compute multiplier-bootstrap inference for all dCDH targets. @@ -160,6 +170,14 @@ def _compute_dcdh_bootstrap( ) rng = np.random.default_rng(self.seed) + # PSU label for each bootstrap weight column is derived from + # the group's ID via `_map_for_target`, not from positional + # truncation. All current dCDH bootstrap targets use the + # variance-eligible group ordering (`eligible_group_ids`); if a + # future target uses a different ordering, add a dedicated + # group-IDs parameter for it rather than reusing the overall + # eligible list. 
+ # --- Overall DID_M --- # Skip the scalar DID_M bootstrap when divisor_overall <= 0 # (e.g., pure non-binary panels where N_S=0), but continue @@ -175,6 +193,11 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH overall DID_M bootstrap", return_distribution=True, + group_to_psu_map=_map_for_target( + u_centered_overall.shape[0], + group_id_to_psu_code, + eligible_group_ids, + ), ) else: overall_se = np.nan @@ -206,6 +229,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH joiners DID_+ bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_j.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.joiners_se = se_j results.joiners_ci = ci_j @@ -225,6 +251,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH leavers DID_- bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_l.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.leavers_se = se_l results.leavers_ci = ci_l @@ -244,6 +273,9 @@ def _compute_dcdh_bootstrap( rng=rng, context="dCDH placebo DID_M^pl bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_pl.size, group_id_to_psu_code, eligible_group_ids, + ), ) results.placebo_se = se_pl results.placebo_ci = ci_pl @@ -259,19 +291,47 @@ def _compute_dcdh_bootstrap( es_pvals: Dict[int, float] = {} es_dists: Dict[int, np.ndarray] = {} - # Shared weight matrix sized for the group set + # Shared weight matrix sized for the group set. Under PSU-level + # bootstrap (Hall-Mammen wild PSU), weights are drawn once per + # PSU and broadcast to groups so all groups in the same PSU + # share a multiplier within a single bootstrap replicate — + # preserving the sup-t joint distribution across horizons. 
n_groups_mh = n_groups_for_overall - shared_weights = _generate_bootstrap_weights_batch( + shared_weights = _generate_psu_or_group_weights( n_bootstrap=self.n_bootstrap, - n_units=n_groups_mh, + n_groups_target=n_groups_mh, weight_type=self.bootstrap_weights, rng=rng, + group_to_psu_map=_map_for_target( + n_groups_mh, group_id_to_psu_code, eligible_group_ids, + ), ) for l_h, (u_h, n_h, eff_h) in sorted(multi_horizon_inputs.items()): if u_h.size > 0 and n_h > 0: - # Use the shared weight matrix truncated to u_h length - w_h = shared_weights[:, : u_h.size] + # Under the current contract every horizon's IF + # vector uses the variance-eligible group ordering + # from `eligible_group_ids`, so the shared weight + # matrix is already at the right shape. Assert + # this invariant so any future refactor that + # introduces horizon-specific masking fails loudly + # rather than silently misaligning PSU clusters via + # positional truncation. + if u_h.size != n_groups_mh: + raise ValueError( + f"Multi-horizon bootstrap: horizon {l_h} " + f"IF vector has {u_h.size} entries but " + f"shared weight matrix has {n_groups_mh} " + f"columns. dCDH's contract requires every " + f"horizon to use the variance-eligible " + f"group ordering; to support a horizon " + f"with a different ordering, thread " + f"target-specific group IDs through " + f"`multi_horizon_inputs` and project the " + f"shared PSU draws onto the horizon's own " + f"ordering via `_map_for_target`." 
+ ) + w_h = shared_weights deviations = (w_h @ u_h) / n_h dist_h = deviations + eff_h @@ -324,6 +384,9 @@ def _compute_dcdh_bootstrap( rng=rng, context=f"dCDH placebo l={l_h} bootstrap", return_distribution=False, + group_to_psu_map=_map_for_target( + u_h.size, group_id_to_psu_code, eligible_group_ids, + ), ) pl_ses[l_h] = se_h pl_cis[l_h] = ci_h @@ -341,6 +404,125 @@ def _compute_dcdh_bootstrap( # ============================================================================= +def _map_for_target( + target_size: int, + group_id_to_psu_code: Optional[Dict[Any, int]], + eligible_group_ids: Optional[np.ndarray], +) -> Optional[np.ndarray]: + """Build a PSU map for a bootstrap target from IDs (not positions). + + The caller passes: + - ``group_id_to_psu_code``: a dict mapping each variance-eligible + group ID to its dense PSU code (built once in ``fit()``). + - ``eligible_group_ids``: the ordered list of group IDs that + correspond to the current target's ``u_centered`` vector. + + Returns an integer array of length ``target_size`` where entry + ``i`` is the PSU code for the ``i``-th contributing group. + + Returns ``None`` when no PSU information is available (plain + multiplier-bootstrap path — identity across targets). + + Raises ``ValueError`` if ``target_size`` does not match + ``len(eligible_group_ids)``: every current dCDH bootstrap target + uses the variance-eligible group ordering, so any size mismatch + signals that a caller introduced a target whose group subset + diverges and should pass its own ``target_group_ids`` rather than + reusing the overall eligible list. Also raises ``ValueError`` if + any group ID is missing from the dict (signaling misalignment + between the target's IF vector and the map's keys). 
+    """
+    if group_id_to_psu_code is None or eligible_group_ids is None:
+        return None
+    if target_size != len(eligible_group_ids):
+        raise ValueError(
+            f"Bootstrap target size ({target_size}) does not match "
+            f"eligible_group_ids length ({len(eligible_group_ids)}). "
+            "dCDH's bootstrap contract requires all current targets to "
+            "use the variance-eligible group ordering; if a new target "
+            "has a different ordering, pass target-specific group IDs "
+            "to _map_for_target rather than reusing eligible_group_ids."
+        )
+    try:
+        return np.array(
+            [group_id_to_psu_code[gid] for gid in eligible_group_ids],
+            dtype=np.int64,
+        )
+    except KeyError as e:
+        raise ValueError(
+            f"Group ID {e.args[0]!r} in eligible_group_ids has no entry "
+            f"in group_id_to_psu_code — PSU map is misaligned with the "
+            f"bootstrap target's group set."
+        ) from e
+
+
+def _generate_psu_or_group_weights(
+    n_bootstrap: int,
+    n_groups_target: int,
+    weight_type: str,
+    rng: np.random.Generator,
+    group_to_psu_map: Optional[np.ndarray],
+) -> np.ndarray:
+    """Generate a group-level weight matrix, possibly via PSU broadcasting.
+
+    When ``group_to_psu_map`` is ``None`` or is the identity (each group
+    is its own PSU), generates weights at the group level directly —
+    bit-identical to the pre-PSU-bootstrap contract.
+
+    When ``group_to_psu_map`` has fewer unique values than
+    ``n_groups_target`` (strictly coarser PSU than group), generates
+    weights at the PSU level and broadcasts to groups via the map.
+    This is the Hall-Mammen wild PSU bootstrap.
+
+    Parameters
+    ----------
+    n_bootstrap, weight_type, rng
+        Passed through to ``_generate_bootstrap_weights_batch``.
+    n_groups_target : int
+        Number of groups contributing to the target's IF vector.
+    group_to_psu_map : np.ndarray or None
+        Dense integer PSU indices of shape ``(n_groups_target,)``.
+        ``None`` triggers the group-level path.
+
+    Returns
+    -------
+    np.ndarray
+        Shape ``(n_bootstrap, n_groups_target)`` multiplier weights.
+ """ + if group_to_psu_map is None: + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + if len(group_to_psu_map) != n_groups_target: + raise ValueError( + f"group_to_psu_map length ({len(group_to_psu_map)}) does not " + f"match n_groups_target ({n_groups_target})." + ) + n_psu = int(np.max(group_to_psu_map)) + 1 if group_to_psu_map.size > 0 else 0 + if n_psu >= n_groups_target: + # Identity (each group its own PSU) — skip the broadcast for a + # bit-identical fast path matching the pre-PSU-bootstrap behavior. + return _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_groups_target, + weight_type=weight_type, + rng=rng, + ) + # Hall-Mammen wild PSU bootstrap: draw n_psu multipliers, broadcast + # via the dense index map so all groups in the same PSU share a + # multiplier. Preserves clustered sampling structure. + psu_weights = _generate_bootstrap_weights_batch( + n_bootstrap=n_bootstrap, + n_units=n_psu, + weight_type=weight_type, + rng=rng, + ) + return psu_weights[:, group_to_psu_map] + + def _bootstrap_one_target( u_centered: np.ndarray, divisor: int, @@ -351,6 +533,7 @@ def _bootstrap_one_target( rng: np.random.Generator, context: str, return_distribution: bool, + group_to_psu_map: Optional[np.ndarray] = None, ) -> Tuple[float, Tuple[float, float], float, Optional[np.ndarray]]: """ Run the multiplier bootstrap for a single dCDH target. @@ -367,16 +550,24 @@ def _bootstrap_one_target( sample mean of the bootstrap distribution should be approximately zero, not the original effect). The original effect is passed separately as the centering point for the percentile p-value. + + When ``group_to_psu_map`` is provided (length ``len(u_centered)``, + dense integer PSU indices), multiplier weights are generated at the + PSU level and broadcast to groups so all groups in the same PSU + receive the same bootstrap multiplier. 
This is the Hall-Mammen wild + PSU bootstrap; it reduces to the group-level bootstrap when each + group is its own PSU (identity map). """ n_groups_target = u_centered.shape[0] if n_groups_target == 0 or divisor == 0: return np.nan, (np.nan, np.nan), np.nan, None - weight_matrix = _generate_bootstrap_weights_batch( + weight_matrix = _generate_psu_or_group_weights( n_bootstrap=n_bootstrap, - n_units=n_groups_target, + n_groups_target=n_groups_target, weight_type=weight_type, rng=rng, + group_to_psu_map=group_to_psu_map, ) # Each bootstrap replicate: (1 / divisor) * sum_g w_b[g] * u_centered[g] diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 7bbd950c..c9b2851e 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -266,7 +266,7 @@ est.fit( trends_nonparam: Any | None = None, # DID^s state-set-specific trends honest_did: bool = False, # HonestDiD sensitivity on placebos # ---- survey support ---- - survey_design: SurveyDesign | None = None, # pweight + strata/PSU/FPC (TSL) + survey_design: SurveyDesign | None = None, # pweight (TSL or replicate BRR/Fay/JK1/JKn/SDR); n_bootstrap > 0 uses Hall-Mammen PSU multiplier ) -> ChaisemartinDHaultfoeuilleResults ``` @@ -320,9 +320,9 @@ print(f"sigma_fe (sign-flipping threshold): {diagnostic.sigma_fe:.3f}") **Notes:** - Validated against R `DIDmultiplegtDYN` v2.3.3 at horizon `l = 1` via `tests/test_chaisemartin_dhaultfoeuille_parity.py` -- Placebo SE contract: single-period placebo `DID_M^pl` (`L_max=None`) has `NaN` SE because the per-period aggregation path has no influence-function derivation; inference fields stay NaN-consistent **even when `n_bootstrap > 0`** for the single-period path (bootstrap covers `DID_M`, `DID_+`, and `DID_-` only). 
Multi-horizon dynamic placebos `DID^{pl}_l` (`L_max >= 1`) have valid analytical SE via the placebo influence function (same cohort-recentered structure as positive horizons, applied to backward outcome differences), with bootstrap SE override when `n_bootstrap > 0`. This is a library extension beyond the dynamic companion paper, which states Theorem 1 variance for `DID_l` only. +- Placebo SE contract: single-period placebo `DID_M^pl` (`L_max=None`) has `NaN` SE because the per-period aggregation path has no influence-function derivation; inference fields stay NaN-consistent **even when `n_bootstrap > 0`** for the single-period path (single-period bootstrap covers only `DID_M`, `DID_+`, and `DID_-`). Multi-horizon dynamic placebos `DID^{pl}_l` (`L_max >= 1`) have valid analytical SE via the placebo influence function (same cohort-recentered structure as positive horizons, applied to backward outcome differences), with bootstrap SE override when `n_bootstrap > 0` — bootstrap at `L_max >= 1` covers `DID_M`, `DID_+`, `DID_-`, per-horizon event-study effects (`event_study_effects[l]`), and placebo horizons (`placebo_event_study[-l]`), with shared weights across horizons for valid joint (sup-t) bands. This is a library extension beyond the dynamic companion paper, which states Theorem 1 variance for `DID_l` only. - The analytical CI is conservative under Assumption 8 (independent groups) of the dynamic companion paper, exact only under iid sampling -- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization. Replicate weights and PSU-level bootstrap deferred +- Survey design supported: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR). Opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0`. 
Replicate + `n_bootstrap > 0` rejected with `NotImplementedError` (replicate variance is closed-form) ### SunAbraham @@ -1672,8 +1672,8 @@ sd_female, data_female = sd.subpopulation(data, mask=lambda df: df['sex'] == 'F' **Key features:** - Taylor Series Linearization (TSL) variance with strata + PSU + FPC -- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (12 of 16 estimators) -- Survey-aware bootstrap: multiplier at PSU or Rao-Wu rescaled +- Replicate weight variance: BRR, Fay's BRR, JK1, JKn, SDR (13 of 16 estimators, including dCDH) +- Survey-aware bootstrap: multiplier at PSU (Hall-Mammen wild; dCDH, staggered) or Rao-Wu rescaled (SyntheticDiD, SunAbraham) - DEFF diagnostics, subpopulation analysis, weight trimming (`trim_weights`) - Repeated cross-sections: `CallawaySantAnna(panel=False)` - Compatibility matrix: see `docs/choosing_estimator.rst` Survey Design Support section diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index b4cd419f..f1c575cf 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -463,7 +463,7 @@ The multiplier bootstrap uses random weights w_i with E[w]=0 and Var(w)=1: - [de Chaisemartin, C. & D'Haultfœuille, X. (2020). Two-Way Fixed Effects Estimators with Heterogeneous Treatment Effects. *American Economic Review*, 110(9), 2964-2996.](https://doi.org/10.1257/aer.20181169) - [de Chaisemartin, C. & D'Haultfœuille, X. (2022, revised 2024). Difference-in-Differences Estimators of Intertemporal Treatment Effects. NBER Working Paper 29873.](https://www.nber.org/papers/w29873) — Web Appendix Section 3.7.3 contains the cohort-recentered plug-in variance formula implemented here. -**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. 
Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization; replicate weights and PSU-level bootstrap are deferred. **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. +**Phase 1-2 scope:** Ships the contemporaneous-switch estimator `DID_M` (= `DID_1` at horizon `l = 1`) from the AER 2020 paper **plus** the full multi-horizon event study `DID_l` for `l = 1..L_max` from the dynamic companion paper. Phase 2 adds: per-group `DID_{g,l}` building block (Equation 3), dynamic placebos `DID^{pl}_l`, normalized estimator `DID^n_l`, cost-benefit aggregate `delta`, sup-t simultaneous confidence bands, and `plot_event_study()` integration. Phase 3 adds covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, and HonestDiD integration. Survey design supports pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) or replicate-weight variance (BRR/Fay/JK1/JKn/SDR) across all IF sites, plus opt-in PSU-level Hall-Mammen wild bootstrap via `n_bootstrap > 0` (see the full checklist + Notes below for the contract). **This is the only modern staggered estimator in the library that handles non-absorbing (reversible) treatments** - treatment can switch on AND off over time, making it the natural fit for marketing campaigns, seasonal promotions, on/off policy cycles. 
**Key implementation requirements:** @@ -589,7 +589,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** When every variance-eligible group forms its own `(D_{g,1}, F_g, S_g)` cohort (a degenerate small-panel case where the cohort framework has zero degrees of freedom), the cohort-recentered plug-in formula is unidentified: cohort recentering subtracts the cohort mean from each group's `U^G_g`, and for singleton cohorts the centered value is exactly zero, so the centered influence function vector collapses to all zeros. The estimator returns `overall_se = NaN` with a `UserWarning` rather than silently collapsing to `0.0` (which would falsely imply infinite precision). The `DID_M` point estimate remains well-defined. The bootstrap path inherits the same degeneracy on these panels — the multiplier weights act on an all-zero vector, so the bootstrap distribution is also degenerate. **Deviation from R `DIDmultiplegtDYN`:** R returns a non-zero SE on the canonical 4-group worked example via small-sample sandwich machinery that Python does not implement. Both responses are valid for a degenerate case; Python's `NaN`+warning is the safer default. To get a non-degenerate SE, include more groups so cohorts have peers (real-world panels typically have `G >> K`). -- **Note (Phase 1 cluster contract):** `ChaisemartinDHaultfoeuille` always clusters at the group level. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group; both inference paths cluster at the user's `group` column with no other option. The constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` with a Phase 1 pointer at construction time (and the same gate fires from `set_params`). Custom clustering at a coarser or finer level than the group is reserved for a future phase. 
The matching test is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. +- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. **Automatic PSU-level clustering under `survey_design`:** when `survey_design` carries PSUs strictly coarser than the group and `n_bootstrap > 0`, the multiplier bootstrap automatically switches to Hall-Mammen wild PSU clustering (see the Note on survey + bootstrap below). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and PSU-level paths are identical (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. - **Note (bootstrap inference surface):** When `n_bootstrap > 0`, the top-level `results.overall_p_value` / `results.overall_conf_int` (and joiners/leavers analogues) hold **percentile-based bootstrap inference** computed by the multiplier bootstrap, NOT normal-theory recomputations from the bootstrap SE. The t-stat (`overall_t_stat`, etc.) is computed from the SE via `safe_inference()[0]` to satisfy the project's anti-pattern rule (never compute `t = effect / se` inline) — bootstrap does not define an alternative t-stat semantic for percentile bootstrap, so the SE-based t-stat is the natural choice. 
`event_study_effects[1]`, `summary()`, `to_dataframe()`, `is_significant`, and `significance_stars` all read from these top-level fields and therefore reflect the bootstrap inference automatically. The library precedent for this propagation is `imputation.py:790-805`, `two_stage.py:778-787`, and `efficient_did.py:1009-1013`. The single-period placebo (`L_max=None`) still has NaN bootstrap fields; multi-horizon placebos (`L_max >= 1`) have valid bootstrap SE/CI/p via `placebo_horizon_ses/cis/p_values` on the bootstrap results object. The matching test is `test_bootstrap_p_value_and_ci_propagated_to_top_level` in `tests/test_chaisemartin_dhaultfoeuille.py::TestBootstrap`. @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. 
The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. -- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. 
+- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. 
The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. 
@@ -633,7 +633,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Single-lag placebo `DID_M^pl` (point estimate; SE deferred to Phase 2) - [x] TWFE decomposition diagnostic (Theorem 1 of AER 2020): per-cell weights, fraction negative, `sigma_fe`, `beta_fe` - [x] Standalone `twowayfeweights()` helper for users who only want the TWFE diagnostic -- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group +- [x] Multiplier bootstrap with Rademacher / Mammen / Webb weights, clustered at group by default; automatically upgraded to PSU-level Hall-Mammen wild clustering under `survey_design` with strictly-coarser PSUs - [x] `drop_larger_lower=True` default (matches R `DIDmultiplegtDYN`); `False` opt-in with explicit inconsistency warning - [x] Singleton-baseline filter (footnote 15 of dynamic paper, variance computation only) with explicit warning - [x] Never-switching groups participate in the variance via stable-control roles after the Round 2 full-IF fix; `n_groups_dropped_never_switching` field retained as backwards-compatibility metadata only @@ -648,9 +648,9 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface -- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Replicate weights and PSU-level bootstrap deferred. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. 
The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). -- **Note (survey + bootstrap fallback):** When `survey_design` and `n_bootstrap > 0` are both active, the multiplier bootstrap uses group-level Rademacher/Mammen/Webb weights rather than PSU-level resampling. A `UserWarning` is emitted from `fit()`. This is conservative when groups are finer than PSUs; a PSU-level survey bootstrap is deferred to a future release. For design-based analytical variance, the TSL path (non-bootstrap) is the recommended contract. +- [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. +- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. 
The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. 
Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. 
--- diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 1193ce60..081d9e92 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -404,32 +404,35 @@ def test_survey_design_rejects_fweight(self, data): def test_cluster_parameter_raises_not_implemented(self, data): """ - Per Phase 1 cluster contract: dCDH always clusters at the - group level via the cohort-recentered influence function - (analytical SEs) and the multiplier bootstrap (also grouped at - the group column). Custom clustering is not supported in - Phase 1. + Per the dCDH cluster contract: dCDH clusters at the group + level by default via the cohort-recentered influence function + (analytical SEs) and the multiplier bootstrap. Under + ``survey_design`` with strictly-coarser PSUs the bootstrap + automatically upgrades to PSU-level Hall-Mammen wild. User- + specified clustering via the ``cluster=`` kwarg is not + supported. The reviewer flagged that ``cluster`` was previously accepted on ``__init__`` and stored on ``self.cluster`` but never actually read by ``fit()`` or ``_compute_dcdh_bootstrap()``, - making it a silent no-op. This test pins the new contract: any + making it a silent no-op. This test pins the contract: any non-None cluster value raises ``NotImplementedError`` at construction time with a message naming the offending value - and pointing at the Phase 1 reservation. The same gate fires - from ``set_params``. + and pointing at the no-custom-clustering reservation. The + same gate fires from ``set_params``. - See REGISTRY.md ``Note (Phase 1 cluster contract)``. + See REGISTRY.md ``Note (cluster contract)``. 
""" + pattern = r"cluster.*(not supported|reserved for a future)" # __init__ rejects any non-None cluster - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="state") - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): ChaisemartinDHaultfoeuille(cluster="unit") # set_params after construction also rejects est = ChaisemartinDHaultfoeuille() - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): est.set_params(cluster="state") # cluster=None still works (the only supported value) @@ -439,7 +442,7 @@ def test_cluster_parameter_raises_not_implemented(self, data): # The convenience function also rejects (forward-compat gate # propagates through the wrapper at __init__ time) - with pytest.raises(NotImplementedError, match=r"cluster.*Phase 1"): + with pytest.raises(NotImplementedError, match=pattern): chaisemartin_dhaultfoeuille( data, outcome="outcome", diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index c2c7c0f0..1aa9f14c 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -317,8 +317,16 @@ def test_varying_weights_change_att(self, base_data): result_survey.overall_att, abs=0.01 ) - def test_rejects_replicate_weights(self, base_data): - """Replicate weight variance not yet supported.""" + def test_rejects_replicate_weights_with_bootstrap(self, base_data): + """Replicate weights combined with n_bootstrap > 0 is rejected. + + Replicate variance is closed-form (compute_replicate_if_variance); + combining it with a multiplier bootstrap would double-count + variance. Matches library precedent in efficient_did.py:989, + staggered.py:1869, two_stage.py:251-253. 
The standalone + replicate-only path (n_bootstrap=0) is supported separately; + see tests/test_survey_dcdh_replicate_psu.py. + """ df = base_data.copy() df["pw"] = 1.0 df["rep1"] = 1.0 @@ -328,8 +336,8 @@ def test_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - ChaisemartinDHaultfoeuille().fit( + with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( df, outcome="outcome", group="group", @@ -366,9 +374,31 @@ def test_multi_horizon_survey_runs(self, data_with_survey): class TestBootstrapSurveyWarning: - def test_bootstrap_survey_emits_warning(self, data_with_survey): + def test_bootstrap_survey_auto_inject_no_warning(self, data_with_survey): + """Under auto-inject psu=group, Hall-Mammen wild PSU bootstrap + coincides with the group-level multiplier bootstrap — so no + warning should fire. The old 'PSU-level deferred' warning has + been replaced with a conditional one that only fires when the + user passes a strictly coarser PSU. + + See the new test file tests/test_survey_dcdh_replicate_psu.py for + the strictly-coarser-PSU case where the warning DOES fire. + """ sd = SurveyDesign(weights="pw") - with pytest.warns(UserWarning, match="group-level multiplier"): + import warnings as _w + + with _w.catch_warnings(): + _w.simplefilter("error", UserWarning) + # Ignore warnings unrelated to the bootstrap-PSU contract. 
+ _w.filterwarnings( + "ignore", message="Single-period placebo SE" + ) + _w.filterwarnings( + "ignore", message="pweight weights normalized" + ) + _w.filterwarnings( + "ignore", message="Assumption 11" + ) ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( data_with_survey, outcome="outcome", @@ -728,9 +758,11 @@ def test_twfe_helper_rejects_non_pweight(self, base_data): survey_design=sd, ) - def test_twfe_helper_rejects_replicate_weights(self, base_data): - """Replicate-weight survey designs must be rejected by the helper, - matching fit()'s NotImplementedError contract.""" + def test_twfe_helper_accepts_replicate_weights(self, base_data): + """Replicate-weight designs are accepted by the helper (no SE field + on TWFEWeightsResult, so only cell aggregation is affected). Matches + fit()'s new contract where replicate variance runs analytically via + compute_replicate_if_variance.""" from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights df_ = base_data.copy() @@ -742,13 +774,15 @@ def test_twfe_helper_rejects_replicate_weights(self, base_data): replicate_weights=["rep1", "rep2"], replicate_method="BRR", ) - with pytest.raises(NotImplementedError, match="Replicate"): - twowayfeweights( - df_, - outcome="outcome", group="group", - time="period", treatment="treatment", - survey_design=sd, - ) + result = twowayfeweights( + df_, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(result.beta_fe) + assert np.isfinite(result.sigma_fe) + assert 0.0 <= result.fraction_negative <= 1.0 # ── Test: TWFE diagnostic oracle under survey ─────────────────────── diff --git a/tests/test_survey_dcdh_replicate_psu.py b/tests/test_survey_dcdh_replicate_psu.py new file mode 100644 index 00000000..595f80bd --- /dev/null +++ b/tests/test_survey_dcdh_replicate_psu.py @@ -0,0 +1,1166 @@ +"""Replicate-weight variance and PSU-level bootstrap tests for dCDH. 
+ +Covers the two survey-variance extensions added after PR #307: +1. Replicate-weight variance (BRR/Fay/JK1/JKn/SDR) via the unified + `compute_replicate_if_variance` helper, routed through the inline + branch in `_survey_se_from_group_if` (Class A sites) and the + parallel inline branch in `_compute_heterogeneity_test` (Class B). +2. PSU-level Hall-Mammen wild bootstrap via `group_to_psu_map` + threaded through `_compute_dcdh_bootstrap`, with an identity-map + fast path preserving auto-inject `psu=group` semantics. +""" + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ChaisemartinDHaultfoeuille, SurveyDesign +from diff_diff.chaisemartin_dhaultfoeuille import twowayfeweights + + +# ── Fixtures ──────────────────────────────────────────────────────── + + +REPLICATE_METHODS = ["BRR", "Fay", "JK1", "JKn", "SDR"] + + +def _make_reversible_panel( + n_groups: int = 30, + n_periods: int = 5, + seed: int = 42, +) -> pd.DataFrame: + """Simple reversible-treatment panel with two switch cohorts.""" + rng = np.random.default_rng(seed) + rows = [] + for g in range(n_groups): + f = 2 if g < n_groups // 2 else 3 + for t in range(n_periods): + d = 1 if t >= f else 0 + y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3) + rows.append( + { + "group": g, + "period": t, + "treatment": d, + "outcome": y, + "pw": 1.0, + "x_het": float(g) * 0.1, + } + ) + return pd.DataFrame(rows) + + +def _attach_replicate_weights( + df: pd.DataFrame, + R: int, + method: str, + seed: int = 101, +) -> pd.DataFrame: + """Add R replicate-weight columns and return the mutated DataFrame.""" + rng = np.random.default_rng(seed) + df = df.copy() + for r in range(R): + if method == "BRR" or method == "Fay" or method == "SDR": + df[f"rep{r}"] = rng.choice([0.5, 1.5], size=len(df)) + elif method in ("JK1", "JKn"): + df[f"rep{r}"] = rng.uniform(0.5, 1.5, size=len(df)) + return df + + +def _rep_cols(R: int) -> list: + return [f"rep{r}" for r in range(R)] + + 
+def _build_replicate_design(R: int, method: str) -> SurveyDesign:
+    """Build a SurveyDesign object with method-specific required params."""
+    if method == "Fay":
+        return SurveyDesign(
+            weights="pw",
+            replicate_weights=_rep_cols(R),
+            replicate_method="Fay",
+            fay_rho=0.5,
+        )
+    if method == "JKn":
+        return SurveyDesign(
+            weights="pw",
+            replicate_weights=_rep_cols(R),
+            replicate_method="JKn",
+            replicate_strata=[r % 2 for r in range(R)],
+        )
+    return SurveyDesign(
+        weights="pw",
+        replicate_weights=_rep_cols(R),
+        replicate_method=method,
+    )
+
+
+@pytest.fixture(scope="module")
+def base_panel():
+    return _make_reversible_panel(n_groups=30, n_periods=5, seed=42)
+
+
+@pytest.fixture
+def replicate_design():
+    """Helper attaching replicate-weight columns to a copy of a DataFrame for a given method."""
+
+    def _build(df: pd.DataFrame, R: int, method: str, seed: int = 101) -> pd.DataFrame:
+        return _attach_replicate_weights(df, R=R, method=method, seed=seed)
+
+    return _build
+
+
+def _make_strictly_coarser_psu_panel(
+    n_groups_per_psu: int = 3,
+    n_psu: int = 4,
+    n_periods: int = 5,
+    seed: int = 17,
+) -> pd.DataFrame:
+    """Panel where groups nest strictly inside PSUs.
+
+    12 groups across 4 PSUs (3 groups/PSU). Two switch cohorts so the
+    estimator has joiners at two different F_g.
+    """
+    rng = np.random.default_rng(seed)
+    n_groups = n_groups_per_psu * n_psu
+    rows = []
+    for g in range(n_groups):
+        psu = g // n_groups_per_psu
+        f = 2 if g < n_groups // 2 else 3
+        for t in range(n_periods):
+            d = 1 if t >= f else 0
+            y = float(g) + 0.5 * t + 1.0 * d + rng.normal(scale=0.3)
+            rows.append(
+                {
+                    "group": g,
+                    "period": t,
+                    "treatment": d,
+                    "outcome": y,
+                    "pw": 1.0,
+                    "psu": psu,
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+# ── 1. 
Class A replicate (sites via _survey_se_from_group_if) ──────── + + +class TestReplicateClassA: + """Replicate variance wired through the inline branch in + `_survey_se_from_group_if` — inherited by overall DID_M, joiners + DID_+, leavers DID_-, multi-horizon DID_l, placebo DID^pl_l. + """ + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_overall_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) + assert res.overall_se > 0 + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_inference_fields_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_t_stat) + assert np.isfinite(res.overall_p_value) + assert np.all(np.isfinite(res.overall_conf_int)) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_df_survey_reflects_n_valid(self, base_panel, replicate_design, method): + R = 24 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + # All replicates should succeed on a clean panel → df = n_valid - 1 = R - 1 + assert res.survey_metadata.df_survey == R - 1 + + def test_jk1_converges_to_tsl(self, replicate_design): + """JK1 replicate SE should be in the same order of magnitude as + analytical TSL 
SE on a large panel. Uses a loose factor-of-2 tolerance
+        because convergence depends on weight-construction details; the
+        stringent R-parity convergence test lives in
+        `test_survey_phase6.py`."""
+        df = _make_reversible_panel(n_groups=60, n_periods=6, seed=42)
+        R = 30
+        df_rep = replicate_design(df, R=R, method="JK1")
+
+        sd_rep = _build_replicate_design(R, "JK1")
+        sd_tsl = SurveyDesign(weights="pw")
+
+        res_rep = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_rep,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+            survey_design=sd_rep,
+        )
+        res_tsl = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+            survey_design=sd_tsl,
+        )
+        # Same point estimate (both unweighted)
+        assert res_rep.overall_att == pytest.approx(res_tsl.overall_att, rel=1e-6)
+        # SEs should be within a factor of 2 of each other (broad envelope)
+        ratio = res_rep.overall_se / res_tsl.overall_se
+        assert 0.5 <= ratio <= 2.0, (
+            f"Replicate SE ({res_rep.overall_se:.4f}) and TSL SE "
+            f"({res_tsl.overall_se:.4f}) differ by more than 2x."
+ ) + + @pytest.mark.parametrize("method", ["BRR", "JK1"]) + def test_multi_horizon_under_replicate( + self, replicate_design, method + ): + """Multi-horizon DID_l inherits the Class A dispatch — horizon 1 + (always identifiable on this panel) should produce a finite + replicate-based SE.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_placebo_under_replicate(self, replicate_design): + """Placebo DID^{pl}_l under replicate weights carries finite SE, + p-value, and CI. Placebo horizons are stored under NEGATIVE + keys in ``placebo_event_study`` (see REGISTRY.md line 546 — + ``-l`` corresponds to backward outcome difference + ``Y_{g, F_g-1-l} - Y_{g, F_g-1}``). Pre-R3-fix this test used + key ``1`` and silently skipped the assertion.""" + df = _make_reversible_panel(n_groups=30, n_periods=6, seed=42) + R = 20 + df = replicate_design(df, R=R, method="BRR") + sd = _build_replicate_design(R, "BRR") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + assert res.placebo_event_study, ( + "Expected placebo_event_study to be populated for L_max=1 " + "reversible panel" + ) + # Negative key convention: placebo horizon l → key -l. 
+ assert -1 in res.placebo_event_study, ( + f"Expected key -1 in placebo_event_study (got keys " + f"{sorted(res.placebo_event_study)})" + ) + info = res.placebo_event_study[-1] + assert info["n_obs"] > 0, "Placebo horizon should have observations" + assert np.isfinite(info["se"]) and info["se"] > 0 + assert np.isfinite(info["p_value"]) + assert np.all(np.isfinite(info["conf_int"])) + + def test_did_x_replicate(self, base_panel, replicate_design): + """DID^X covariate adjustment flows through the Class A dispatch + (Y_mat residualization happens upstream of IF).""" + df = base_panel.copy() + rng = np.random.default_rng(123) + df["cov1"] = rng.normal(size=len(df)) + R = 20 + df = replicate_design(df, R=R, method="JK1") + sd = _build_replicate_design(R, "JK1") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + controls=["cov1"], + L_max=1, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + +# ── 2. 
Class B replicate (heterogeneity + twowayfeweights) ────────── + + +class TestReplicateClassB: + """Sites that call `compute_survey_if_variance` directly (or don't + route through the variance path at all) and need their own + replicate handling.""" + + @pytest.mark.parametrize("method", REPLICATE_METHODS) + def test_heterogeneity_se_finite(self, base_panel, replicate_design, method): + R = 20 + df = replicate_design(base_panel, R=R, method=method) + sd = _build_replicate_design(R, method) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + heterogeneity="x_het", + ) + assert res.heterogeneity_effects is not None + info = res.heterogeneity_effects[1] + assert np.isfinite(info["beta"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_twowayfeweights_accepts_replicate(self, base_panel, replicate_design): + """TWFE diagnostic produces the same beta_fe / sigma_fe / + fraction_negative under a replicate design as under a + pweight-only design (replicate weights only affect + `resolved.weights` cell aggregation).""" + R = 20 + df = replicate_design(base_panel, R=R, method="BRR") + sd_rep = _build_replicate_design(R, "BRR") + sd_plain = SurveyDesign(weights="pw") + + res_rep = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_rep, + ) + res_plain = twowayfeweights( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_plain, + ) + # Cell aggregation uses resolved.weights; replicate_weights are + # present but don't change the aggregated diagnostics. + assert res_rep.beta_fe == pytest.approx(res_plain.beta_fe, rel=1e-10) + assert res_rep.sigma_fe == pytest.approx(res_plain.sigma_fe, rel=1e-10) + assert res_rep.fraction_negative == pytest.approx( + res_plain.fraction_negative, rel=1e-10 + ) + + +# ── 3. 
PSU-level Hall-Mammen wild bootstrap ───────────────────────── + + +class TestPSUBootstrap: + + def test_auto_inject_bit_identical_to_group_level(self, base_panel): + """Under auto-inject psu=group the Hall-Mammen PSU bootstrap + collapses to the group-level multiplier bootstrap via the + identity-map fast path — producing bit-identical SE.""" + df = base_panel.copy() + sd_auto = SurveyDesign(weights="pw") + sd_explicit = SurveyDesign(weights="pw", psu="group") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + r_auto = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_auto, + ) + r_explicit = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd_explicit, + ) + assert r_auto.overall_se == pytest.approx(r_explicit.overall_se, rel=1e-10) + + def test_coarser_psu_produces_finite_se(self): + """Under a strictly coarser PSU, the Hall-Mammen PSU bootstrap + still produces a finite SE (the value depends on within-PSU + IF correlation).""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_att) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_no_warning_under_auto_inject(self, base_panel): + """No UserWarning about PSU-level bootstrap should fire under + auto-inject psu=group (the warning only fires for strictly + coarser PSU).""" + sd = SurveyDesign(weights="pw") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + ChaisemartinDHaultfoeuille(n_bootstrap=100, 
seed=1).fit( + base_panel, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert not any( + "Hall-Mammen wild" in str(w.message) + or "group-level multiplier" in str(w.message) + for w in caught + ), ( + "Bootstrap-PSU warning should not fire under auto-inject psu=group, " + f"but got: {[str(w.message) for w in caught]}" + ) + + def test_warning_under_coarser_psu(self): + """When PSU is strictly coarser than group, the Hall-Mammen + warning fires.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with pytest.warns(UserWarning, match="Hall-Mammen wild"): + ChaisemartinDHaultfoeuille(n_bootstrap=100, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + + @pytest.mark.parametrize("weight_type", ["rademacher", "mammen", "webb"]) + def test_all_weight_types_under_psu(self, weight_type): + """Rademacher / Mammen / Webb multipliers should all produce + finite SE under the PSU-level path.""" + df = _make_strictly_coarser_psu_panel() + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille( + n_bootstrap=200, seed=1, bootstrap_weights=weight_type + ).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + ) + assert np.isfinite(res.overall_se) and res.overall_se > 0 + + def test_multi_horizon_psu_bootstrap(self): + """Event study bootstrap inherits PSU broadcasting.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=300, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + ) + info = res.event_study_effects[1] + 
assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_multi_horizon_shared_draw_under_coarser_psu(self): + """End-to-end test of the shared-draw multi-horizon bootstrap + under a strictly coarser PSU with L_max >= 2. Exercises the + per-horizon invariant assertion + sup-t band computation + through the Hall-Mammen wild PSU path.""" + df = _make_strictly_coarser_psu_panel(n_periods=7) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # Both horizons should produce finite bootstrap SEs via the + # shared-draw path projected through the PSU map. + for lag in (1, 2): + info = res.event_study_effects[lag] + if info["n_obs"] > 0: + assert np.isfinite(info["effect"]) + assert np.isfinite(info["se"]) and info["se"] > 0 + + def test_sup_t_under_coarser_psu(self): + """Sup-t critical value is computable under coarser PSU.""" + df = _make_strictly_coarser_psu_panel(n_periods=6) + sd = SurveyDesign(weights="pw", psu="psu") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(n_bootstrap=500, seed=1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + survey_design=sd, + L_max=2, + ) + # sup_t_bands may be None if only one horizon is valid or if + # the cband critical value was not computed. + if res.sup_t_bands is not None: + assert res.sup_t_bands.get("cband_crit_value") is None or ( + res.sup_t_bands["cband_crit_value"] > 1.0 + ) + + +# ── 4. 
Invariants and cross-cutting contracts ────────────────────────
+
+
+class TestInvariants:
+
+    def test_replicate_plus_bootstrap_rejected(self, base_panel, replicate_design):
+        """Replicate variance + n_bootstrap > 0 raises NotImplementedError."""
+        R = 10
+        df = replicate_design(base_panel, R=R, method="BRR")
+        sd = _build_replicate_design(R, "BRR")
+        with pytest.raises(NotImplementedError, match="replicate weights and n_bootstrap"):
+            ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit(
+                df,
+                outcome="outcome",
+                group="group",
+                time="period",
+                treatment="treatment",
+                survey_design=sd,
+            )
+
+    def test_non_survey_unchanged(self, base_panel):
+        """Non-survey fits (survey_design=None) use the unchanged group-
+        level path: two identically-seeded fits must yield the same SE."""
+        r1 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit(
+            base_panel,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+        )
+        r2 = ChaisemartinDHaultfoeuille(n_bootstrap=200, seed=1).fit(
+            base_panel,
+            outcome="outcome",
+            group="group",
+            time="period",
+            treatment="treatment",
+        )
+        assert r1.overall_se == pytest.approx(r2.overall_se, rel=1e-10)
+
+    def test_map_for_target_id_lookup(self):
+        """`_map_for_target` builds the PSU map from group IDs via dict
+        lookup, not positional reuse. All current dCDH bootstrap
+        targets use the variance-eligible group ordering, so the
+        helper looks up each target group's PSU via
+        `group_id_to_psu_code`.
Length mismatch → ValueError (loud + failure rather than silent miscluster).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _map_for_target, + ) + + gid_to_psu = {"a": 0, "b": 0, "c": 1, "d": 1, "e": 2} + eligible = np.asarray(["a", "b", "c", "d", "e"]) + expected = np.array([0, 0, 1, 1, 2], dtype=np.int64) + built = _map_for_target(5, gid_to_psu, eligible) + assert built is not None + assert np.array_equal(built, expected) + + # None passthrough when no PSU info + assert _map_for_target(5, None, None) is None + + # Length mismatch → loud failure + with pytest.raises(ValueError, match="target size"): + _map_for_target(3, gid_to_psu, eligible) + + # Missing group ID → loud failure + gid_to_psu_incomplete = {"a": 0, "b": 0, "c": 1, "d": 1} + with pytest.raises(ValueError, match="has no entry"): + _map_for_target(5, gid_to_psu_incomplete, eligible) + + # Non-prefix reordering: different ordered group IDs produce a + # different PSU map even if the dict is the same. This is the + # key invariant the previous prefix-slicing lacked. + eligible_reordered = np.asarray(["c", "a", "d", "e", "b"]) + built_reordered = _map_for_target(5, gid_to_psu, eligible_reordered) + assert built_reordered is not None + expected_reordered = np.array([1, 0, 1, 2, 0], dtype=np.int64) + assert np.array_equal(built_reordered, expected_reordered) + + def test_generate_psu_or_group_weights_broadcast(self): + """Direct unit test of the PSU-level weight generator: + groups mapped to the same PSU receive the same multiplier + within a single bootstrap replicate (Hall-Mammen wild PSU + contract).""" + from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import ( + _generate_psu_or_group_weights, + ) + + # 6 groups → 3 PSUs: groups (0,1), (2,3), (4,5) share multipliers. 
+        group_to_psu = np.array([0, 0, 1, 1, 2, 2], dtype=np.int64)
+        rng = np.random.default_rng(0)
+        W = _generate_psu_or_group_weights(
+            n_bootstrap=50,
+            n_groups_target=6,
+            weight_type="rademacher",
+            rng=rng,
+            group_to_psu_map=group_to_psu,
+        )
+        assert W.shape == (50, 6)
+        # Groups in the same PSU must receive the same multiplier
+        # within each bootstrap replicate.
+        assert np.array_equal(W[:, 0], W[:, 1])
+        assert np.array_equal(W[:, 2], W[:, 3])
+        assert np.array_equal(W[:, 4], W[:, 5])
+        # Different PSUs should NOT all produce identical columns
+        # (given 50 replicates and Rademacher weights, collision
+        # probability is 2^-50 → effectively 0).
+        assert not np.array_equal(W[:, 0], W[:, 2])
+
+    def test_generate_psu_or_group_weights_identity(self):
+        """Identity map (each group its own PSU) uses the fast path and
+        produces group-level weights bit-identical to the non-PSU path."""
+        from diff_diff.chaisemartin_dhaultfoeuille_bootstrap import (
+            _generate_psu_or_group_weights,
+        )
+
+        identity_map = np.arange(6, dtype=np.int64)
+        rng1 = np.random.default_rng(0)
+        W_identity = _generate_psu_or_group_weights(
+            n_bootstrap=20,
+            n_groups_target=6,
+            weight_type="mammen",
+            rng=rng1,
+            group_to_psu_map=identity_map,
+        )
+        rng2 = np.random.default_rng(0)
+        W_plain = _generate_psu_or_group_weights(
+            n_bootstrap=20,
+            n_groups_target=6,
+            weight_type="mammen",
+            rng=rng2,
+            group_to_psu_map=None,
+        )
+        assert np.array_equal(W_identity, W_plain)
+
+    @pytest.mark.parametrize("method", ["BRR", "JK1"])
+    def test_honest_did_under_replicate(
+        self, base_panel, replicate_design, method
+    ):
+        """HonestDiD bounds should flow through replicate SE and the
+        reduced df_survey (df = min(n_valid) - 1)."""
+        R = 20
+        df = replicate_design(base_panel, R=R, method=method)
+        sd = _build_replicate_design(R, method)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            res = ChaisemartinDHaultfoeuille(seed=1).fit(
+                df,
+                outcome="outcome",
+                group="group",
+
time="period", + treatment="treatment", + survey_design=sd, + L_max=1, + honest_did=True, + ) + assert res.survey_metadata.df_survey == R - 1 + if res.honest_did_results is not None: + # HonestDiD bounds exist and are finite; exact structure is + # honest_did.py internals. Just verify the attribute is + # populated so the SE flow-through is exercised. + assert res.honest_did_results is not None + + def test_rank_deficient_replicate_uses_design_df(self, base_panel): + """Duplicated replicate columns → QR-rank < R → design df < R-1. + + Regression for PR #311 CI review R1 P1. The main dCDH surface, + ``survey_metadata.df_survey``, and HonestDiD must all use the + reduced effective df (``min(design_df, n_valid - 1)``) rather + than the naive ``n_valid - 1`` that ignored the design's QR + rank. Before the fix, ``survey_metadata.df_survey`` stayed at + ``R - 1`` for rank-deficient replicate designs — which is + anti-conservative for t-inference. + """ + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Force duplicate columns to induce rank deficiency: + # rep1 = rep0, rep3 = rep2 → true rank = R - 2. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + # Read off the design-level df via the resolved design so the + # test is robust to internal QR tolerance changes. + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1, ( + f"Expected rank deficiency: design df={design_df} " + f"should be < {R - 1}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, honest_did=True, + ) + # survey_metadata carries the final (reduced) df — not R - 1. 
+ assert res.survey_metadata.df_survey == design_df, ( + f"Expected survey_metadata.df_survey == design_df " + f"({design_df}), got {res.survey_metadata.df_survey}." + ) + # HonestDiD reads results.survey_metadata.df_survey directly + # (honest_did.py:973), so the same reduced df flows through. + # We assert the HonestDiD result is populated (df flow-through + # is exercised by the honest_did=True call above). + assert res.honest_did_results is not None + + def test_dropped_replicate_reduces_df(self, base_panel): + """When some replicate columns produce degenerate per-replicate + estimates (e.g., all-zero weight vectors), the design's QR rank + drops below R — which flows through to the effective df. + + Regression for PR #311 CI review R1 P2. Even when every IF site + reports the full ``n_valid = R``, the persisted df must never + exceed the design-level df. Pre-fix, ``_effective_df_survey`` + returned ``n_valid - 1 = R - 1`` which over-counted degrees of + freedom on a reduced-rank design. + """ + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=4) + # Zero out two replicate columns → drops rank by 2. + df["rep0"] = 0.0 + df["rep1"] = 0.0 + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df <= R - 3, ( + f"Expected zero-column rank deficiency: design df={design_df} " + f"should be <= {R - 3}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # Persisted df must be capped by the design df, not R - 1. + assert res.survey_metadata.df_survey is not None + assert res.survey_metadata.df_survey <= design_df, ( + f"Expected survey_metadata.df_survey <= design_df " + f"({design_df}), got {res.survey_metadata.df_survey}. 
" + f"Effective df must NEVER exceed the design-level df under " + f"replicate variance." + ) + assert res.survey_metadata.df_survey < R - 1, ( + f"Expected persisted df to reflect reduced rank (< R - 1 = " + f"{R - 1}), got {res.survey_metadata.df_survey}." + ) + + def test_rank_1_replicate_forces_nan_inference(self, base_panel): + """All replicate columns identical → QR-rank = 1 → design df + undefined (None). Even if per-IF replicate SEs come back + finite, every public inference field (t_stat / p_value / + conf_int) MUST be NaN — otherwise users get z-based inference + silently instead of the NaN contract documented in REGISTRY.md. + + R2 P0 regression: before the fix, ``_effective_df_survey`` + returned None, ``safe_inference(df=None)`` computed z-based + statistics, and a rank-1 replicate design with finite SEs + would produce valid-looking (but wrong) p-values/CIs. + """ + R = 8 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=3) + # Make every replicate column identical → QR-rank = 1. + rep_col_shared = df["rep0"].copy() + for r in range(R): + df[f"rep{r}"] = rep_col_shared + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + assert resolved.df_survey is None, ( + "Rank-1 replicate matrix must have undefined design df" + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + # survey_metadata.df_survey stays None — the honest undefined + # signal for downstream consumers. + assert res.survey_metadata.df_survey is None + # Main dCDH inference fields are NaN via the _inference_df + # coercion (None → 0 under replicate → safe_inference NaN + # branch). + assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + # Event study horizon 1 is also NaN. 
+ if 1 in res.event_study_effects: + info = res.event_study_effects[1] + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + + def test_heterogeneity_replicate_cross_surface_df_consistency( + self, base_panel, + ): + """With ``heterogeneity=`` active under a rank-deficient + replicate design, every public surface — top-level dCDH, + event study, heterogeneity, ``survey_metadata.df_survey``, + and HonestDiD — MUST use the SAME final effective df. + + R2 P1 regression: before the fix, the main surfaces + (``overall_*``, ``event_study_effects``) computed t/p/CI + with an intermediate ``_df_survey`` that did not include + heterogeneity's ``n_valid_het``. Then + ``survey_metadata.df_survey`` was overwritten with the + post-heterogeneity min — so HonestDiD used a different df + than the main dCDH surface. + """ + from scipy import stats as _stats + + R = 12 + df = _attach_replicate_weights(base_panel, R=R, method="BRR", seed=7) + # Rank deficiency: 2 duplicated pairs → rank = R - 2 = 10. + df["rep1"] = df["rep0"] + df["rep3"] = df["rep2"] + sd = _build_replicate_design(R, "BRR") + resolved = sd.resolve(df) + design_df = resolved.df_survey + assert design_df is not None and design_df < R - 1 + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + honest_did=True, + ) + final_df = res.survey_metadata.df_survey + # All sites report n_valid == R → effective df = min(design_df, R-1) + # = design_df. + assert final_df == design_df + # Verify overall inference was computed with df=final_df by + # checking p-value against t(final_df) — would disagree if + # df=R-1 or df=None was used. 
+ t_stat = res.overall_att / res.overall_se + expected_p = 2 * _stats.t.sf(abs(t_stat), df=final_df) + assert res.overall_p_value == pytest.approx(expected_p, rel=1e-6) + # Same check for heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]): + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf(abs(het_t), df=final_df) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6 + ) + # HonestDiD bounds were computed from the same survey_metadata; + # asserting the result attribute is populated confirms the df + # flow-through was exercised. + assert res.honest_did_results is not None + + def test_heterogeneity_late_nvalid_propagates_to_all_surfaces( + self, base_panel, monkeypatch, + ): + """Regression for PR #311 CI review R4 P2. + + The duplicated-column test above exercises the design-df cap + (which binds BEFORE heterogeneity runs), so it cannot catch a + bug where the post-heterogeneity recompute block fails to + propagate the final df to every public surface. This test + isolates the late-arriving case by monkeypatching + `compute_replicate_if_variance` to return a SMALLER n_valid + only on the heterogeneity call site — after main surfaces + have already frozen their t/p/CI fields with the larger df. + + Every public surface (`overall_*`, `event_study_effects`, + `placebo_event_study`, `heterogeneity_effects`, + `survey_metadata.df_survey`) must reflect the reduced df. + """ + from scipy import stats as _stats + from diff_diff import survey as _survey_mod + + R = 20 + reduced_n_valid = 7 # small enough that (reduced - 1) < R - 1 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + # Clean (full-rank) design — design df is NOT the binding cap. 
+ assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Pass 1: count the main-surface calls to + # `compute_replicate_if_variance` (L_max=1, no heterogeneity). + main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Pass 2: reduce n_valid on calls STRICTLY AFTER the main + # surfaces (i.e., heterogeneity's call). + tracker = {"count": 0} + + def reduce_after_main(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, reduced_n_valid + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", + reduce_after_main, + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + + # Heterogeneity's smaller n_valid should be the binding final + # df: min(R - 1, reduced_n_valid - 1) = reduced_n_valid - 1. + expected_df = reduced_n_valid - 1 + assert res.survey_metadata.df_survey == expected_df, ( + f"survey_metadata.df_survey should be {expected_df} " + f"(reduced_n_valid - 1), got {res.survey_metadata.df_survey}" + ) + + # Overall surface: p-value must use the reduced df. 
+ t_stat = res.overall_att / res.overall_se + expected_p_overall = 2 * _stats.t.sf(abs(t_stat), df=expected_df) + assert res.overall_p_value == pytest.approx( + expected_p_overall, rel=1e-6, + ), ( + "overall_p_value should reflect heterogeneity's late " + "n_valid via the post-recompute block" + ) + + # Event study (L=1) surface. + if res.event_study_effects and 1 in res.event_study_effects: + info = res.event_study_effects[1] + if np.isfinite(info["se"]) and info["se"] > 0: + t_l = info["effect"] / info["se"] + expected_p_l = 2 * _stats.t.sf(abs(t_l), df=expected_df) + assert info["p_value"] == pytest.approx( + expected_p_l, rel=1e-6, + ) + + # Placebo event study surface (NEGATIVE keys). This is the + # one the copied-value bug would hide: `placebo_event_study` + # holds a VALUE copy of `placebo_horizon_inference`, so a + # broken recompute block would leave stale t/p/CI here. + if res.placebo_event_study: + for _lag_neg, pl_info in res.placebo_event_study.items(): + if np.isfinite(pl_info["se"]) and pl_info["se"] > 0: + t_pl = pl_info["effect"] / pl_info["se"] + expected_p_pl = 2 * _stats.t.sf( + abs(t_pl), df=expected_df, + ) + assert pl_info["p_value"] == pytest.approx( + expected_p_pl, rel=1e-6, + ), ( + f"placebo_event_study[{_lag_neg}] p-value " + f"must reflect the reduced df " + f"({expected_df}); got " + f"{pl_info['p_value']} vs expected " + f"{expected_p_pl}" + ) + + # Heterogeneity surface. + if res.heterogeneity_effects: + het_info = res.heterogeneity_effects[1] + if np.isfinite(het_info["se"]) and het_info["se"] > 0: + het_t = het_info["beta"] / het_info["se"] + expected_het_p = 2 * _stats.t.sf( + abs(het_t), df=expected_df, + ) + assert het_info["p_value"] == pytest.approx( + expected_het_p, rel=1e-6, + ) + + # Normalized effects surface. This is a DERIVED dict built + # pre-heterogeneity (delta-method SE from per-horizon SE) — + # same class of "copy" bug as placebo_event_study. 
R4 P0 + # regression: without the recompute, normalized_effects would + # silently ship p_value/conf_int computed with the + # pre-heterogeneity df. + if res.normalized_effects: + for _lag, norm_info in res.normalized_effects.items(): + if np.isfinite(norm_info["se"]) and norm_info["se"] > 0: + norm_t = norm_info["effect"] / norm_info["se"] + expected_norm_p = 2 * _stats.t.sf( + abs(norm_t), df=expected_df, + ) + assert norm_info["p_value"] == pytest.approx( + expected_norm_p, rel=1e-6, + ), ( + f"normalized_effects[{_lag}] p-value must " + f"reflect the reduced df ({expected_df}); got " + f"{norm_info['p_value']} vs expected " + f"{expected_norm_p}" + ) + + def test_late_nvalid_below_two_forces_all_surfaces_nan( + self, base_panel, monkeypatch, + ): + """Regression for R4 P2 follow-up: when heterogeneity's late + replicate failures drive the final `n_valid` to 1 (reduced df + = 0), every public inference surface must flip to NaN — not + silently keep the finite values that were computed with a + larger intermediate df. + + `_inference_df` coerces df=0 under replicate → safe_inference + NaN branch. Exercises the all-NaN contract end to end. + """ + from diff_diff import survey as _survey_mod + + R = 15 + df = _attach_replicate_weights(base_panel, R=R, method="JK1", seed=5) + sd = _build_replicate_design(R, "JK1") + resolved = sd.resolve(df) + assert resolved.df_survey == R - 1 + + original = _survey_mod.compute_replicate_if_variance + + # Count main-only calls. 
+ main_counter = {"n": 0} + + def count_only(psi, resolved_arg): + main_counter["n"] += 1 + return original(psi, resolved_arg) + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", count_only + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + main_call_count = main_counter["n"] + monkeypatch.undo() + + # Return n_valid=1 on heterogeneity's call → reduced df = 0 + # → _inference_df coerces to NaN. + tracker = {"count": 0} + + def reduce_to_one(psi, resolved_arg): + tracker["count"] += 1 + var, n_valid = original(psi, resolved_arg) + if tracker["count"] > main_call_count: + return var, 1 + return var, n_valid + + monkeypatch.setattr( + _survey_mod, "compute_replicate_if_variance", reduce_to_one + ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + heterogeneity="x_het", + ) + # Final effective df is None (reduced_df = 0 < 1 guard). + assert res.survey_metadata.df_survey is None + # Every public inference surface must be NaN. 
+ assert np.isnan(res.overall_t_stat) + assert np.isnan(res.overall_p_value) + assert not np.all(np.isfinite(res.overall_conf_int)) + if res.event_study_effects: + for info in res.event_study_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.placebo_event_study: + for info in res.placebo_event_study.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.heterogeneity_effects: + for info in res.heterogeneity_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"]) + if res.normalized_effects: + for info in res.normalized_effects.values(): + assert np.isnan(info["t_stat"]) + assert np.isnan(info["p_value"])