diff --git a/CHANGELOG.md b/CHANGELOG.md index ca42006f..a84732c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Add Zenodo DOI badge to README; upgrade the BibTeX citation block with the concept DOI (`10.5281/zenodo.19646175`) and list author as Isaac Gerber (matching `CITATION.cff`). Add `doi:` and `identifiers:` entries (concept + versioned) to `CITATION.cff`. DOI was minted by Zenodo when v3.1.3 was released. +- **`ChaisemartinDHaultfoeuille` heterogeneity + within-group-varying PSU/strata now supported under Binder TSL** - `fit(heterogeneity=..., survey_design=...)` no longer raises `NotImplementedError` when the resolved design's PSU or strata vary across the cells of a group. On the **Binder TSL** branch (`compute_survey_if_variance`), the heterogeneity WLS coefficient IF is expanded to observation level via the cell-period allocator `ψ_i = ψ_g * (w_i / W_{g, out_idx})` on the post-period cell — the DID_l post-period single-cell convention shipped in v3.1.x. Under PSU=group the PSU-level Binder TSL variance is byte-identical to the previous release (PSU-level aggregate telescopes to `ψ_g`); under within-group-varying PSU, mass lands in the post-period PSU of the transition. The **Rao-Wu replicate-weight** branch (`compute_replicate_if_variance`) retains the legacy group-level allocator `ψ_i = ψ_g * (w_i / W_g)`: replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at observation level and is therefore not PSU-telescoping, so the cell-period allocator would silently change the replicate SE whenever a replicate column's ratios vary within group (e.g., per-row replicate matrices). Replicate + heterogeneity fits therefore produce byte-identical SE to the previous release, and the newly-unblocked `heterogeneity=` + within-group-varying PSU combination is unreachable under replicate designs by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). `n_bootstrap > 0` combined with within-group-varying PSU remains gated with `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map and will be extended in a follow-up PR. ## [3.1.3] - 2026-04-18 diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index a8562a33..846c9506 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -665,10 +665,7 @@ def fit( clustered bootstrap. **Out-of-scope combinations raise ``NotImplementedError``**: (a) replicate weights with ``n_bootstrap > 0`` (replicate variance is closed-form; - bootstrap would double-count variance); (b) - ``heterogeneity=`` with PSU/strata that vary within group - (heterogeneity WLS still uses the legacy group-level IF - expansion; follow-up PR extends it); (c) ``n_bootstrap > + bootstrap would double-count variance); (b) ``n_bootstrap > 0`` with PSU that varies within group (PSU-level bootstrap still uses the legacy group-level PSU map; follow-up PR extends it). See REGISTRY.md @@ -828,39 +825,25 @@ def fit( # Cell-period IF allocator contract: strata and PSU must be # constant within each (g, t) cell, a strict relaxation of - # the previous within-group constancy rule. Two out-of-scope - # combinations are gated with NotImplementedError until the - # corresponding follow-up PRs extend them: - # - heterogeneity= + within-group-varying PSU/strata - # (PR 3: cell-period allocator for the WLS psi_obs) + # the previous within-group constancy rule. One out-of-scope + # combination remains gated with NotImplementedError until + # the corresponding follow-up PR extends it: # - n_bootstrap > 0 + within-group-varying PSU # (PR 4: cell-level Hall-Mammen wild bootstrap) - strata_varies, psu_varies = _strata_psu_vary_within_group( + _, psu_varies = _strata_psu_vary_within_group( resolved_survey, data, group, survey_weights, ) - if strata_varies or psu_varies: - if heterogeneity is not None: - raise NotImplementedError( - "heterogeneity= is not supported under a survey " - "design whose PSU or strata vary within group. " - "The heterogeneity WLS path uses the legacy " - "group-level IF expansion and will be extended " - "to the cell-period allocator in a follow-up " - "PR. For now, either (a) set heterogeneity=None, " - "or (b) collapse PSU/strata to be constant " - "within each group." - ) - if self.n_bootstrap > 0: - raise NotImplementedError( - "n_bootstrap > 0 is not supported under a " - "survey design whose PSU varies within group. " - "The PSU-level Hall-Mammen wild bootstrap uses " - "the legacy group-level PSU map and will be " - "extended to cell-level PSU in a follow-up PR. " - "For now, use n_bootstrap=0 (analytical TSL " - "variance, which fully supports within-group-" - "varying PSU via the cell-period allocator)." - ) + if psu_varies and self.n_bootstrap > 0: + raise NotImplementedError( + "n_bootstrap > 0 is not supported under a " + "survey design whose PSU varies within group. " + "The PSU-level Hall-Mammen wild bootstrap uses " + "the legacy group-level PSU map and will be " + "extended to cell-level PSU in a follow-up PR. " + "For now, use n_bootstrap=0 (analytical TSL " + "variance, which fully supports within-group-" + "varying PSU via the cell-period allocator)." + ) _validate_cell_constant_strata_psu( resolved_survey, data, group, time, survey_weights, ) @@ -3734,15 +3717,45 @@ def _compute_heterogeneity_test( Required when ``obs_survey_info`` is supplied. obs_survey_info : dict, optional Observation-level survey info with keys ``group_ids`` (raw per-row - group labels), ``weights`` (per-row survey weights), and ``resolved`` - (ResolvedSurveyDesign). When provided, the regression uses WLS with - per-group weights W_g = sum of obs survey weights. SE is computed - via Binder TSL IF expansion through ``compute_survey_if_variance`` - by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), - dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style - variance. The effective df for t-critical values follows the - site-level ``min(df_s, n_valid_het - 1)`` rule and the helper - mutates ``replicate_n_valid_list`` so the final + group labels), ``time_ids`` (raw per-row period labels), + ``weights`` (per-row survey weights), ``resolved`` + (ResolvedSurveyDesign), and ``periods`` (sorted canonical period + array matching ``Y_mat``'s column order). When provided, the + regression uses WLS with per-group weights + ``W_g = sum of obs survey weights in group g``. The group-level + WLS coefficient IF is + ``ψ_g = inv(X'WX)[1,:] @ x_g * W_g * r_g``. Two observation-level + expansions of ``ψ_g`` coexist on this path, split by variance + helper so each path uses the allocator that preserves + byte-identity for its aggregation rule: + + * **Binder TSL** (``compute_survey_if_variance``): the + cell-period single-cell allocator — + ``ψ_i = ψ_g * (w_i / W_{g, out_idx})`` for obs in + ``(g, out_idx)``, zero elsewhere. Under PSU=group per-obs + distribution differs from the legacy + ``ψ_i = ψ_g * (w_i / W_g)`` but PSU-level aggregates + telescope to the same ``ψ_g``, so Binder variance is + byte-identical to the pre-cell-period release. Under + within-group-varying PSU mass lands in the post-period PSU + of the transition (DID_l post-period convention). + * **Rao-Wu replicate** (``compute_replicate_if_variance``): + the legacy group-level allocator ``ψ_i = ψ_g * (w_i / W_g)``. + Replicate variance computes ``θ_r = sum_i ratio_ir * ψ_i`` + at observation level, so moving ψ_g mass onto the + post-period cell would silently change the replicate SE + whenever a replicate column's ratios vary within a group + (which the library allows — e.g., per-row BRR/Fay/SDR + matrices). Keeping the legacy allocator on this branch + preserves byte-identity of replicate SE across every + previously-supported fit. Replicate + within-group-varying + PSU is unreachable by construction (``SurveyDesign`` + rejects ``replicate_weights`` combined with explicit + ``strata/psu/fpc``). + + The effective df for t-critical values follows the site-level + ``min(df_s, n_valid_het - 1)`` rule and the helper mutates + ``replicate_n_valid_list`` so the final ``_effective_df_survey(...)`` sees this site's n_valid. replicate_n_valid_list : list[int], optional Shared accumulator for replicate-weight ``n_valid`` counts across @@ -3931,25 +3944,46 @@ def _compute_heterogeneity_test( XtWX_inv = np.linalg.pinv(XtWX) psi_g = (XtWX_inv[1, :] @ design.T) * W_elig * r_g # (n_eligible,) - # Expand to obs level: ψ_i = ψ_g * (w_i / W_g) for i in group g. - psi_obs = np.zeros(len(obs_w_raw)) - for e_idx, g_idx in enumerate(eligible): - gid = gid_list[g_idx] - mask_g = (obs_gids_raw == gid) & valid - w_sum_g = obs_w_raw[mask_g].sum() - if w_sum_g > 0: - psi_obs[mask_g] = psi_g[e_idx] * ( - obs_w_raw[mask_g] / w_sum_g - ) - - # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR) - # vs Binder TSL across stratified PSUs. Mirrors the inline - # branch in _survey_se_from_group_if and the pattern in - # TripleDifference:1206-1238. Heterogeneity uses WLS with - # full-sample weights; theta_hat is treated as fixed per the - # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity - # under replicate — no per-replicate refits). + # Allocator dispatch. Two observation-level expansions of + # ψ_g coexist on this path, split by variance helper: + # + # * Binder TSL (compute_survey_if_variance): cell-period + # single-cell allocator — + # ψ_i = ψ_g * (w_i / W_{g, out_idx}) + # for obs in (g, out_idx), zero elsewhere. Under + # PSU=group, per-obs distribution differs from the + # legacy ψ_i = ψ_g * (w_i / W_g) but PSU-level + # aggregates telescope to ψ_g, so Binder variance is + # byte-identical. Under within-group-varying PSU, mass + # lands in the post-period PSU of the transition, which + # is what Binder needs. DID_l single-cell convention — + # see REGISTRY.md ChaisemartinDHaultfoeuille survey IF + # expansion Note. + # + # * Rao-Wu replicate (compute_replicate_if_variance): + # legacy group-level allocator — + # ψ_i = ψ_g * (w_i / W_g) + # for obs in group g. Replicate variance computes + # θ_r = sum_i ratio_ir * ψ_i at observation level, so + # moving ψ_g onto the post-period cell only would + # silently change the replicate SE whenever a + # replicate column's ratios vary within group (e.g., + # the per-row replicate matrices this library + # accepts). The group-level allocator preserves + # byte-identity for all replicate usages under + # PSU=group. The replicate + within-group-varying + # PSU case is not reachable (SurveyDesign rejects + # replicate_weights combined with explicit psu). if getattr(resolved, "uses_replicate_variance", False): + psi_obs = np.zeros(len(obs_w_raw), dtype=np.float64) + for e_idx, g_idx in enumerate(eligible): + gid = gid_list[g_idx] + mask_g = (obs_gids_raw == gid) & valid + w_sum_g = obs_w_raw[mask_g].sum() + if w_sum_g > 0: + psi_obs[mask_g] = psi_g[e_idx] * ( + obs_w_raw[mask_g] / w_sum_g + ) var_s, n_valid_het = compute_replicate_if_variance( psi_obs, resolved ) @@ -3968,6 +4002,23 @@ def _compute_heterogeneity_test( else: df_s_local = min(int(df_s), int(n_valid_het) - 1) else: + obs_tids = np.asarray(obs_survey_info["time_ids"]) + periods_arr = np.asarray(obs_survey_info["periods"]) + psi_obs = np.zeros(len(obs_w_raw), dtype=np.float64) + for e_idx, g_idx in enumerate(eligible): + gid = gid_list[g_idx] + out_idx = first_switch_idx[g_idx] - 1 + l_h + t_val_out = periods_arr[out_idx] + mask_cell = ( + (obs_gids_raw == gid) + & (obs_tids == t_val_out) + & valid + ) + w_cell = obs_w_raw[mask_cell].sum() + if w_cell > 0: + psi_obs[mask_cell] = psi_g[e_idx] * ( + obs_w_raw[mask_cell] / w_cell + ) var_s = compute_survey_if_variance(psi_obs, resolved) df_s_local = df_s se_het = ( @@ -5413,12 +5464,14 @@ def _strata_psu_vary_within_group( ) -> Tuple[bool, bool]: """Return (strata_varies_within_group, psu_varies_within_group). - Diagnostic helper used to gate out-of-scope combinations for the - cell-period IF allocator — heterogeneity and ``n_bootstrap > 0`` - currently require within-group constancy because they read - ``obs_survey_info`` through the legacy group-level expansion path. - PR 3 and PR 4 will extend them. Zero-weight rows are excluded from - the check (subpopulation contract). + Diagnostic helper used at ``fit()`` time to gate the remaining + out-of-scope combination for the cell-period IF allocator: + ``n_bootstrap > 0`` still uses a group-level PSU map and raises + ``NotImplementedError`` when PSU varies within group. The + heterogeneity WLS path supports within-group-varying PSU/strata + via the cell-period allocator (shipped in the PR that lifted the + previous gate). Zero-weight rows are excluded from the check + (subpopulation contract). """ if resolved is None: return False, False @@ -5766,10 +5819,12 @@ def _survey_se_from_group_if( else: # Legacy group-level allocator (no per-period attribution # provided, or time/period info unavailable). Preserved for - # paths that haven't threaded per-period attribution through - # yet (e.g., the heterogeneity psi_obs construction in - # _compute_heterogeneity_test — gated to within-group-constant - # PSU in Stage 2 per PR 2 scope). + # defensive fallback and for unit tests that exercise the + # legacy allocator. No current caller in fit() uses this + # branch — ATT / joiners / leavers / placebos all thread + # U_centered_per_period, and heterogeneity (as of PR 3) + # constructs its own cell-period psi_obs and calls + # compute_survey_if_variance directly. group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)} u_obs_eff = np.array([group_to_u.get(gid, 0.0) for gid in gids_eff]) unique_gids, inverse = np.unique(gids_eff, return_inverse=True) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 31ba4f72..b501a87b 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. -- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (cell-period allocator):** Heterogeneity still uses the legacy group-level IF expansion `ψ_i = ψ_g * (w_i / W_g)`. Combining `heterogeneity=` with a survey design whose PSU or strata vary within group raises `NotImplementedError`; the cell-period extension of the heterogeneity path is deferred to a follow-up PR. Within-group-constant PSU/strata (including the auto-injected `psu=group`) is fully supported. +- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the group-level WLS coefficient influence function is `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`. The group-level IF is then attributed to observation level via **one of two allocators, chosen by variance helper** so each path preserves byte-identity for its aggregation rule: (1) **Binder TSL** (`compute_survey_if_variance`) uses the **cell-period single-cell allocator** — at each horizon `l_h`, `ψ_g` is assigned in full to the post-period cell `(g, out_idx)` with `out_idx = first_switch_idx[g] - 1 + l_h` and expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})` for obs in that cell, zero elsewhere (matches the DID_l post-period convention in the Survey IF expansion Note below). Under PSU=group per-observation distribution differs from the legacy `ψ_i = ψ_g * (w_i / W_g)`, but PSU-level aggregates telescope to the same `ψ_g` — so Binder TSL variance is byte-identical to the pre-cell-period release under PSU=group. Under within-group-varying PSU mass lands in the post-period PSU of the transition, which is what Binder TSL needs. An **empty post-period cell under zero-weight obs** (all obs at `(g, out_idx)` have `w_i = 0` despite `N > 0`) drops the group's contribution, matching the ATT cell allocator's convention; the pre-cell-period path diverged here by redistributing mass to other cells of the group. (2) **Rao-Wu replicate** (`compute_replicate_if_variance`) uses the **legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)`. Replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at the observation level, so moving `ψ_g` mass onto the post-period cell only would silently change the replicate SE whenever a replicate column's ratios vary within group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). Keeping the legacy allocator on this branch preserves byte-identity of replicate SE across every previously-supported fit; replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (bootstrap):** Combining `heterogeneity=` with `n_bootstrap > 0` and within-group-varying PSU still raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap uses a group-level PSU map until the follow-up PR extends it. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. -- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=` or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. +- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=` or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Post-period attribution extends to heterogeneity (Binder TSL branch only):** the heterogeneity WLS coefficient IF `ψ_g = inv(X'WX)[1,:] @ x_g * W_g * r_g` is attributed in full to the single post-period cell `(g, out_idx)` at each horizon (same single-cell convention as DID_l), then expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})`, and fed through `compute_survey_if_variance`. Under PSU=group the PSU-level aggregate telescopes to `ψ_g`, so Binder variance is byte-identical relative to the pre-cell-period release; under within-group-varying PSU mass lands in the post-period PSU. **Replicate-weight branch keeps the legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)` because `compute_replicate_if_variance` computes `θ_r = sum_i ratio_ir * ψ_i` at observation level and is therefore not PSU-telescoping: redistributing mass onto the post-period cell would silently change the replicate SE whenever a replicate column's ratios vary within a group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). The legacy allocator preserves byte-identity of the replicate SE for every previously-supported fit. Replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). **Scope limitation (follow-up PR):** `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in a follow-up PR. - **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. --- diff --git a/tests/test_dcdh_heterogeneity_cell_period_coverage.py b/tests/test_dcdh_heterogeneity_cell_period_coverage.py new file mode 100644 index 00000000..f4012572 --- /dev/null +++ b/tests/test_dcdh_heterogeneity_cell_period_coverage.py @@ -0,0 +1,158 @@ +"""Monte Carlo coverage simulation for the cell-period IF allocator +applied to the heterogeneity WLS path (PR 3). + +Validates empirical null coverage under a DGP with PSU that varies +across cells of the same group — the regime unlocked by PR 3's lift +of the heterogeneity gate. Under within-group-constant PSU the cell +allocator's PSU-level aggregate matches the previous group allocator +byte-for-byte (see `TestHeterogeneityCellPeriod:: +test_psu_level_byte_identity_under_psu_equals_group` in +`test_survey_dcdh.py`), so the coverage check only exercises the new +within-group-varying-PSU code path. + +Null-coverage rationale: + + beta_het_true = 0 is deliberate. The heterogeneity test's + identifying estimand under Lemma 7 is a variance-weighted average + of effect differences, so a non-zero DGP-level beta does not have + a clean "true value" for coverage. A constant-treatment-effect DGP + ensures beta_het_true = 0 exactly, so the CI must cover 0 at + nominal 95% regardless of Lemma 7's variance weighting. + +Marked ``slow`` and excluded from the default pytest run. To execute: + + pytest tests/test_dcdh_heterogeneity_cell_period_coverage.py -m slow -v +""" + +from __future__ import annotations + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff.chaisemartin_dhaultfoeuille import ChaisemartinDHaultfoeuille +from diff_diff.survey import SurveyDesign + + +def _simulate_panel( + n_groups: int, + n_periods: int, + first_treated_period: int, + tau: float, + rng: np.random.Generator, + psu_sigma: float = 0.5, + obs_sigma: float = 1.0, + group_sigma: float = 1.0, +) -> pd.DataFrame: + """Generate a one-obs-per-cell panel with within-group-varying PSU + and a group-level time-invariant covariate X_het. + + Treatment effect ``tau`` is constant (independent of ``X_het``) so + the true heterogeneity coefficient on X_het is exactly 0 under + Lemma 7 — the null-coverage target for this test. + """ + groups = np.arange(n_groups) + treated = groups < n_groups // 2 + group_fe = rng.normal(0.0, group_sigma, size=n_groups) + # (group, psu) effect — constant across time within the (g, psu) + # pair. Drives within-group residual correlation that varies by + # cell, so design-based variance must capture it per-PSU. + psu_fe = rng.normal(0.0, psu_sigma, size=(n_groups, 2)) + # Group-level binary covariate (time-invariant). + x_het = rng.binomial(1, 0.5, size=n_groups).astype(float) + + rows = [] + for g in groups: + for t in range(n_periods): + parity = int(t % 2) + # PSU labels are globally unique per (group, parity) so + # Binder variance sees 2 * n_groups sampling clusters + # rather than 2 global PSU codes reused across groups. + # Within-cell constancy (one obs per cell) is trivially + # satisfied. + psu_id = int(g) * 2 + parity + d = 1 if (treated[g] and t >= first_treated_period) else 0 + y = group_fe[g] + 0.1 * t + tau * d + psu_fe[g, parity] + rng.normal(0.0, obs_sigma) + rows.append( + { + "group": int(g), + "period": int(t), + "treatment": int(d), + "outcome": float(y), + "psu": psu_id, + "pw": 1.0, + "x_het": float(x_het[g]), + } + ) + return pd.DataFrame(rows) + + +@pytest.mark.slow +def test_heterogeneity_cell_period_null_coverage_varying_psu(): + """Empirical 95% null coverage for beta_het under the cell-period + allocator on a DGP with within-group-varying PSU. Tolerance ±2.5pp + mirrors the ATT coverage test in test_dcdh_cell_period_coverage.py + (about 2 MC-sigma at n_reps=500 for a true 95% target). + """ + n_reps = 500 + n_groups = 40 + n_periods = 6 + first_treated_period = 3 + tau_true = 2.0 + + rng = np.random.default_rng(20260419) + covered = 0 + failed = 0 + + for r in range(n_reps): + df = _simulate_panel( + n_groups=n_groups, + n_periods=n_periods, + first_treated_period=first_treated_period, + tau=tau_true, + rng=rng, + ) + sd = SurveyDesign(weights="pw", psu="psu") + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + res = ChaisemartinDHaultfoeuille(seed=r + 1).fit( + df, + outcome="outcome", + group="group", + time="period", + treatment="treatment", + heterogeneity="x_het", + survey_design=sd, + L_max=1, + ) + except Exception: + failed += 1 + continue + + if res.heterogeneity_effects is None: + failed += 1 + continue + entry = res.heterogeneity_effects[1] + ci = entry["conf_int"] + if ci is None or not all(np.isfinite(ci)): + failed += 1 + continue + lo, hi = float(ci[0]), float(ci[1]) + # Null coverage: beta_het_true = 0 since tau is constant. + if lo <= 0.0 <= hi: + covered += 1 + + completed = n_reps - failed + assert completed >= int(0.95 * n_reps), ( + f"MC simulation had {failed}/{n_reps} fit failures, above " f"the 5% tolerance." + ) + coverage = covered / completed + assert 0.925 <= coverage <= 0.975, ( + f"Empirical null coverage {coverage:.3f} (completed " + f"{completed}) outside [0.925, 0.975]; tau_true={tau_true}, " + f"beta_het_true=0, n_groups={n_groups}, n_periods={n_periods}, " + f"n_reps={n_reps}." + ) diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index e400c3eb..4ac5472f 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1075,9 +1075,10 @@ class TestSurveyWithinGroupValidation: """Cell-period IF allocator contract: strata and PSU may vary ACROSS cells of a group, but must be constant WITHIN each (g, t) cell. In canonical one-obs-per-cell panels the cell-level constancy check is - trivially satisfied. Out-of-scope combinations (heterogeneity + - within-group-varying PSU; n_bootstrap > 0 + within-group-varying - PSU) raise NotImplementedError with a pointer to the follow-up PR. + trivially satisfied. Heterogeneity + within-group-varying PSU is + supported via the cell-period allocator. The remaining out-of-scope + combination is n_bootstrap > 0 + within-group-varying PSU, which + still raises NotImplementedError with a pointer to the follow-up PR. """ def test_accepts_varying_psu_within_group(self, base_data): @@ -1138,23 +1139,35 @@ def test_accepts_varying_strata_within_group(self, base_data): assert np.isfinite(result.overall_att) assert np.isfinite(result.overall_se) - def test_heterogeneity_with_varying_psu_raises(self, base_data): - """heterogeneity= is gated under within-group-varying PSU until - PR 3 ships the cell-period allocator for the WLS psi_obs path. + def test_heterogeneity_with_varying_psu_succeeds(self, base_data): + """heterogeneity= is supported under within-group-varying PSU + via the cell-period allocator: psi_g is attributed in full to + the (g, out_idx) post-period cell and expanded to obs level as + psi_i = psi_g * (w_i / W_{g, out_idx}). All five inference + fields must be finite when the survey design is regular. """ df_ = base_data.copy() df_["pw"] = 1.0 - df_["stratum"] = 0 - df_["psu"] = df_["period"] % 2 # varies within group - df_["x_het"] = np.arange(len(df_)) % 3 # categorical covariate - sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") - with pytest.raises(NotImplementedError, match="heterogeneity"): - ChaisemartinDHaultfoeuille(seed=1).fit( - df_, outcome="outcome", group="group", - time="period", treatment="treatment", - heterogeneity="x_het", L_max=1, - survey_design=sd, - ) + # Make x_het time-invariant within each group (heterogeneity + # test requires a group-level covariate). + df_["x_het"] = (df_["group"].astype(int) % 2).astype(float) + # Per-group PSU parity — unique per (group, parity), varies + # within group so the cell-period allocator is exercised. + df_["psu"] = df_["group"].astype(int) * 2 + (df_["period"].astype(int) % 2) + sd = SurveyDesign(weights="pw", psu="psu") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + heterogeneity="x_het", L_max=1, + survey_design=sd, + ) + assert res.heterogeneity_effects is not None + entry = res.heterogeneity_effects[1] + assert np.isfinite(entry["beta"]) + assert np.isfinite(entry["se"]) and entry["se"] >= 0.0 + assert np.isfinite(entry["t_stat"]) + assert np.isfinite(entry["p_value"]) + assert all(np.isfinite(entry["conf_int"])) def test_bootstrap_with_varying_psu_raises(self, base_data): """n_bootstrap > 0 is gated under within-group-varying PSU until @@ -1212,6 +1225,59 @@ def test_auto_inject_with_varying_strata_nest_true_succeeds(self, base_data): == r_explicit.survey_metadata.df_survey ) + def test_heterogeneity_auto_inject_with_varying_strata_nest_true_succeeds(self, base_data): + """PR 3 unblocks heterogeneity + SurveyDesign(strata, nest=True) + with auto-inject psu=group. Under nest=True the resolver + combines (stratum, psu) into globally-unique labels, so + resolved.psu varies across cells of each group and the + cell-period allocator handles it. Mirrors ATT coverage at + test_auto_inject_with_varying_strata_nest_true_succeeds. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = df_["period"] % 2 + df_["x_het"] = (df_["group"].astype(int) % 2).astype(float) + sd = SurveyDesign(weights="pw", strata="stratum", nest=True) + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + heterogeneity="x_het", L_max=1, + survey_design=sd, + ) + assert res.heterogeneity_effects is not None + entry = res.heterogeneity_effects[1] + assert np.isfinite(entry["beta"]) + assert np.isfinite(entry["se"]) and entry["se"] >= 0.0 + assert np.isfinite(entry["t_stat"]) + assert np.isfinite(entry["p_value"]) + assert all(np.isfinite(entry["conf_int"])) + + def test_heterogeneity_multi_horizon_varying_psu_succeeds(self, base_data): + """Multi-horizon heterogeneity (L_max >= 2) + within-group- + varying PSU — each horizon rebuilds its own (g, out_idx) cell + mapping, so the per-horizon allocator must produce finite + inference independently at every horizon. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["x_het"] = (df_["group"].astype(int) % 2).astype(float) + df_["psu"] = df_["group"].astype(int) * 2 + (df_["period"].astype(int) % 2) + sd = SurveyDesign(weights="pw", psu="psu") + res = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + heterogeneity="x_het", L_max=2, + survey_design=sd, + ) + assert res.heterogeneity_effects is not None + for horizon in (1, 2): + entry = res.heterogeneity_effects[horizon] + assert np.isfinite(entry["beta"]), f"beta NaN at horizon {horizon}" + assert np.isfinite(entry["se"]) and entry["se"] >= 0.0 + assert np.isfinite(entry["t_stat"]) + assert np.isfinite(entry["p_value"]) + assert all(np.isfinite(entry["conf_int"])) + def test_auto_inject_with_varying_strata_raises(self, base_data): """Auto-injected `psu=` with nest=False cannot honor strata that vary across cells of a group — the synthesized PSU @@ -1543,3 +1609,173 @@ def test_within_cell_check_excludes_zero_weight_rows(self, base_data): survey_design=sd, ) assert np.isfinite(result.overall_att) + + +class TestHeterogeneityCellPeriod: + """Unit tests for the heterogeneity cell-period allocator invariants. + + Under PSU=group the new single-cell psi_obs distribution + (mass in the (g, out_idx) post-period cell only, scaled by + w_i / W_{g, out_idx}) differs from the legacy group-level + distribution (mass everywhere in g, scaled by w_i / W_g) at the + observation level. But both telescope to the same PSU-level sum + psi_g because compute_survey_if_variance aggregates to PSU first. + Binder TSL variance must therefore be byte-identical. + """ + + def test_psu_level_byte_identity_under_psu_equals_group(self): + """Construct both legacy and new psi_obs on a tiny fixture + (PSU=group, one obs per cell), feed both through + compute_survey_if_variance, and assert variances equal + within ULP — the exact invariant the REGISTRY Note claims. + """ + from diff_diff.survey import ( + SurveyDesign, + compute_survey_if_variance, + ) + + # Fixture: 4 groups * 4 periods = 16 obs, one obs per cell, + # pw=1 everywhere. PSU=group so each group is a single PSU. + n_groups, n_periods = 4, 4 + rows = [] + for g in range(n_groups): + for t in range(n_periods): + rows.append({ + "group": int(g), + "period": int(t), + "pw": 1.0, + }) + df_ = pd.DataFrame(rows) + sd = SurveyDesign(weights="pw", psu="group") + resolved = sd.resolve(df_) + + # Arbitrary non-zero group-level IF values + per-group out_idx + # (the post-period cell chosen by the heterogeneity horizon). + psi_g = np.array([1.0, -0.5, 2.0, -1.5], dtype=np.float64) + out_idx_per_group = np.array([3, 3, 2, 2], dtype=np.int64) + + obs_group = df_["group"].values.astype(np.int64) + obs_period = df_["period"].values.astype(np.int64) + w = df_["pw"].to_numpy(dtype=np.float64) + + # Legacy expansion: psi_i = psi_g[g_i] * w_i / W_g. + W_g = np.zeros(n_groups, dtype=np.float64) + np.add.at(W_g, obs_group, w) + psi_legacy = np.zeros_like(w) + for i in range(len(w)): + if W_g[obs_group[i]] > 0: + psi_legacy[i] = psi_g[obs_group[i]] * (w[i] / W_g[obs_group[i]]) + + # New expansion: psi_i = psi_g[g_i] * w_i / W_{g, out_idx} + # for obs in (g, out_idx), zero elsewhere. + W_cell_out = np.zeros(n_groups, dtype=np.float64) + for i in range(len(w)): + if obs_period[i] == out_idx_per_group[obs_group[i]]: + W_cell_out[obs_group[i]] += w[i] + psi_new = np.zeros_like(w) + for i in range(len(w)): + g_i = obs_group[i] + if obs_period[i] == out_idx_per_group[g_i] and W_cell_out[g_i] > 0: + psi_new[i] = psi_g[g_i] * (w[i] / W_cell_out[g_i]) + + # Distributions differ at the obs level. + assert not np.allclose(psi_legacy, psi_new), ( + "fixture should exercise the mass redistribution — " + "legacy spreads across all obs of g, new concentrates in " + "(g, out_idx)." + ) + + # PSU-level sums must match — this is the invariant the + # REGISTRY Note claims under PSU=group. + assert resolved.psu is not None + psu_codes = np.asarray(resolved.psu, dtype=np.int64) + psu_sum_legacy = np.zeros(n_groups, dtype=np.float64) + psu_sum_new = np.zeros(n_groups, dtype=np.float64) + np.add.at(psu_sum_legacy, psu_codes, psi_legacy) + np.add.at(psu_sum_new, psu_codes, psi_new) + assert np.allclose(psu_sum_legacy, psu_sum_new, atol=0.0, rtol=1e-15) + + # Binder TSL variances must match byte-for-byte (single stratum, + # each PSU sum contributes equally in both paths). + var_legacy = compute_survey_if_variance(psi_legacy, resolved) + var_new = compute_survey_if_variance(psi_new, resolved) + assert np.isfinite(var_legacy) and np.isfinite(var_new) + assert var_legacy == pytest.approx(var_new, rel=1e-14, abs=1e-14) + + def test_replicate_variance_non_invariance_under_varying_ratios(self): + """When replicate-weight ratios vary within group, + compute_replicate_if_variance is NOT PSU-telescoping — its + theta_r = sum_i ratio_ir * psi_i reads psi at observation + level, so redistributing mass across cells of g changes the + replicate variance. This is why the heterogeneity path keeps + the legacy group-level allocator on the replicate branch + (only Binder TSL uses the cell-period allocator). Regression + guard for the CI-review P1 on PR #329. + """ + from diff_diff.survey import ( + SurveyDesign, + compute_replicate_if_variance, + ) + + # Minimal reproduction of the reviewer's counterexample: one + # group with two obs (ref and post-period cells), replicate + # ratios that vary within the group. legacy allocator spreads + # psi_g across both obs; new cell-period allocator puts it + # all on the post-period cell. Replicate variance differs. + rows = [] + for g in range(2): + for t in range(2): + rows.append({ + "group": int(g), + "period": int(t), + "pw": 1.0, + # Non-PSU-aligned replicate columns: vary within + # each group so sum_i ratio_ir * psi_i sees a + # different weighted average of psi mass under + # the two allocators. + "rep0": 0.5 if t == 0 else 1.5, + "rep1": 1.5 if t == 0 else 0.5, + }) + df_ = pd.DataFrame(rows) + sd = SurveyDesign( + weights="pw", + replicate_weights=["rep0", "rep1"], + replicate_method="SDR", + ) + resolved = sd.resolve(df_) + + # Arbitrary non-zero psi_g per group; out_idx = 1 for each. + psi_g = np.array([0.75, -1.25], dtype=np.float64) + obs_group = df_["group"].values.astype(np.int64) + obs_period = df_["period"].values.astype(np.int64) + w = df_["pw"].to_numpy(dtype=np.float64) + + # Legacy group-level expansion (what the replicate branch now + # uses inside _compute_heterogeneity_test after the PR-329 fix). + W_g = np.zeros(2, dtype=np.float64) + np.add.at(W_g, obs_group, w) + psi_legacy = np.zeros_like(w) + for i in range(len(w)): + if W_g[obs_group[i]] > 0: + psi_legacy[i] = psi_g[obs_group[i]] * ( + w[i] / W_g[obs_group[i]] + ) + + # New cell-period single-cell expansion (what the Binder TSL + # branch uses). All mass lands on obs at t == 1. + psi_new = np.zeros_like(w) + for i in range(len(w)): + if obs_period[i] == 1: + psi_new[i] = psi_g[obs_group[i]] # W_{g,out_idx}=1 + + var_legacy, _ = compute_replicate_if_variance(psi_legacy, resolved) + var_new, _ = compute_replicate_if_variance(psi_new, resolved) + assert np.isfinite(var_legacy) and np.isfinite(var_new) + # Documented non-invariance: replicate variance differs + # materially between the two allocators on this fixture. + assert not np.isclose(var_legacy, var_new, rtol=1e-6), ( + f"Expected legacy vs cell-period replicate variance to " + f"differ when replicate ratios vary within group " + f"(counterexample from PR #329 CI review). Got " + f"var_legacy={var_legacy}, var_new={var_new}." + )