From 37bb13ee025728cb3b2f805fd4706b38cd20d6e3 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 06:27:12 -0400 Subject: [PATCH 1/2] Extend PR #312 Y-normalization contract into SDID diagnostic methods PR #312 centered and scaled Y once in SyntheticDiD.fit() to avoid catastrophic cancellation in the SDID double-difference at extreme Y. The fix did not reach the two public diagnostic methods on SyntheticDiDResults: in_time_placebo() and sensitivity_to_zeta_omega() both re-run Frank-Wolfe on the original-scale fit snapshot with original-scale zetas, so at Y ~ 1e9 they reproduce the same silent failure the main path was fixed for. Capture Y_shift and Y_scale on the fit snapshot, apply the same (Y - shift) / scale normalization inside the two diagnostic methods, pass zeta / Y_scale and min_decrease derived from the normalized noise level to the FW weight solvers, and rescale att / pre_fit_rmse back to original-Y units before reporting. Unit-weight diagnostics (max_unit_weight, effective_n) are scale-invariant and reported without rescaling. Regression coverage: - TestDiagnosticScaleParity: att / pre_fit_rmse must scale by |a| across (Y -> a*Y + b) for both diagnostic methods; a no-effect DGP at Y ~ 1e9 must produce finite placebo atts, at least one of which lands within 5 * noise_level. - TestHeterogeneousAndRampingScale: cross-unit heterogeneous scale (units spanning 1e6 to 1e9) and cross-period ramping (trend growing roughly 3 orders of magnitude across periods) must leave the fit and both diagnostic surfaces finite. These are the heterogeneity pathways the original TestScaleEquivariance (affine-only) suite did not cover. Covered by audit axis A (numerical precision / scale fragility). Findings D-4 and D-4b from docs/audits/silent-failures-findings.md.
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/results.py | 117 ++++++++++++------- diff_diff/synthetic_did.py | 2 + docs/methodology/REGISTRY.md | 2 +- tests/test_methodology_sdid.py | 208 +++++++++++++++++++++++++++++++++ 4 files changed, 284 insertions(+), 45 deletions(-) diff --git a/diff_diff/results.py b/diff_diff/results.py index 504a84d0..e15e9927 100644 --- a/diff_diff/results.py +++ b/diff_diff/results.py @@ -669,6 +669,12 @@ class _SyntheticDiDFitSnapshot: post_periods: List[Any] w_control: Optional[np.ndarray] = None w_treated: Optional[np.ndarray] = None + # Normalization constants captured during fit() so diagnostic methods can + # reproduce the main path's centering+scaling and avoid catastrophic + # cancellation on extreme-Y panels. Defaults preserve behavior for + # snapshots built before these fields existed. + Y_shift: float = 0.0 + Y_scale: float = 1.0 def __post_init__(self): for arr in ( @@ -1144,8 +1150,17 @@ def in_time_placebo( "in_time_placebo() needs zeta_omega and zeta_lambda from the " "original fit. Expected on the results object but found None." ) + # Reproduce the main fit path's Y normalization (Y → (Y - shift) / scale) + # so Frank-Wolfe sees the same ~O(1) inputs it saw during fit() and the + # SDID double-difference does not suffer ~6-digit cancellation at + # extreme Y. See SyntheticDiD.fit() and REGISTRY.md §SyntheticDiD. + Y_shift = snap.Y_shift + Y_scale = snap.Y_scale + zeta_omega_n = zeta_omega / Y_scale + zeta_lambda_n = zeta_lambda / Y_scale noise_level = self.noise_level if self.noise_level is not None else 0.0 - min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5 + noise_level_n = noise_level / Y_scale + min_decrease = 1e-5 * noise_level_n if noise_level_n > 0 else 1e-5 # Build the list of (fake_period, position) pairs to iterate. 
period_to_idx = {p: i for i, p in enumerate(pre_periods)} @@ -1195,29 +1210,29 @@ def in_time_placebo( rows.append(row) continue - Y_pre_c = snap.Y_pre_control[:i, :] - Y_post_c = snap.Y_pre_control[i:, :] - Y_pre_t = snap.Y_pre_treated[:i, :] - Y_post_t = snap.Y_pre_treated[i:, :] + Y_pre_c_n = (snap.Y_pre_control[:i, :] - Y_shift) / Y_scale + Y_post_c_n = (snap.Y_pre_control[i:, :] - Y_shift) / Y_scale + Y_pre_t_n = (snap.Y_pre_treated[:i, :] - Y_shift) / Y_scale + Y_post_t_n = (snap.Y_pre_treated[i:, :] - Y_shift) / Y_scale if snap.w_treated is not None: w_t = snap.w_treated - y_pre_t_mean = np.average(Y_pre_t, axis=1, weights=w_t) - y_post_t_mean = np.average(Y_post_t, axis=1, weights=w_t) + y_pre_t_mean_n = np.average(Y_pre_t_n, axis=1, weights=w_t) + y_post_t_mean_n = np.average(Y_post_t_n, axis=1, weights=w_t) else: - y_pre_t_mean = np.mean(Y_pre_t, axis=1) - y_post_t_mean = np.mean(Y_post_t, axis=1) + y_pre_t_mean_n = np.mean(Y_pre_t_n, axis=1) + y_post_t_mean_n = np.mean(Y_post_t_n, axis=1) omega_fake = compute_sdid_unit_weights( - Y_pre_c, - y_pre_t_mean, - zeta_omega=zeta_omega, + Y_pre_c_n, + y_pre_t_mean_n, + zeta_omega=zeta_omega_n, min_decrease=min_decrease, ) lambda_fake = compute_time_weights( - Y_pre_c, - Y_post_c, - zeta_lambda=zeta_lambda, + Y_pre_c_n, + Y_post_c_n, + zeta_lambda=zeta_lambda_n, min_decrease=min_decrease, ) @@ -1231,20 +1246,22 @@ def in_time_placebo( else: omega_eff_fake = omega_fake - att_fake = compute_sdid_estimator( - Y_pre_c, - Y_post_c, - y_pre_t_mean, - y_post_t_mean, + att_fake_n = compute_sdid_estimator( + Y_pre_c_n, + Y_post_c_n, + y_pre_t_mean_n, + y_post_t_mean_n, omega_eff_fake, lambda_fake, ) - synthetic_pre_fake = Y_pre_c @ omega_eff_fake - pre_fit = float( - np.sqrt(np.mean((y_pre_t_mean - synthetic_pre_fake) ** 2)) + synthetic_pre_fake_n = Y_pre_c_n @ omega_eff_fake + pre_fit_n = float( + np.sqrt(np.mean((y_pre_t_mean_n - synthetic_pre_fake_n) ** 2)) ) - row["att"] = float(att_fake) - row["pre_fit_rmse"] = 
pre_fit + # ATT is scale-equivariant and shift-invariant in Y; RMSE is + # scale-equivariant. Rescale back to original-Y units. + row["att"] = float(att_fake_n * Y_scale) + row["pre_fit_rmse"] = pre_fit_n * Y_scale rows.append(row) return pd.DataFrame(rows) @@ -1320,19 +1337,29 @@ def sensitivity_to_zeta_omega( else: zeta_values = [float(z) for z in zeta_grid] + # Reproduce the main fit path's Y normalization so FW sees ~O(1) + # inputs; see in_time_placebo() for the same pattern. + Y_shift = snap.Y_shift + Y_scale = snap.Y_scale noise_level = self.noise_level if self.noise_level is not None else 0.0 - min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5 + noise_level_n = noise_level / Y_scale + min_decrease = 1e-5 * noise_level_n if noise_level_n > 0 else 1e-5 + + Y_pre_control_n = (snap.Y_pre_control - Y_shift) / Y_scale + Y_post_control_n = (snap.Y_post_control - Y_shift) / Y_scale + Y_pre_treated_n = (snap.Y_pre_treated - Y_shift) / Y_scale + Y_post_treated_n = (snap.Y_post_treated - Y_shift) / Y_scale if snap.w_treated is not None: - y_pre_t_mean = np.average( - snap.Y_pre_treated, axis=1, weights=snap.w_treated + y_pre_t_mean_n = np.average( + Y_pre_treated_n, axis=1, weights=snap.w_treated ) - y_post_t_mean = np.average( - snap.Y_post_treated, axis=1, weights=snap.w_treated + y_post_t_mean_n = np.average( + Y_post_treated_n, axis=1, weights=snap.w_treated ) else: - y_pre_t_mean = np.mean(snap.Y_pre_treated, axis=1) - y_post_t_mean = np.mean(snap.Y_post_treated, axis=1) + y_pre_t_mean_n = np.mean(Y_pre_treated_n, axis=1) + y_post_t_mean_n = np.mean(Y_post_treated_n, axis=1) columns = [ "zeta_omega", @@ -1348,9 +1375,9 @@ def sensitivity_to_zeta_omega( rows: List[Dict[str, Any]] = [] for z in zeta_values: omega_fake = compute_sdid_unit_weights( - snap.Y_pre_control, - y_pre_t_mean, - zeta_omega=z, + Y_pre_control_n, + y_pre_t_mean_n, + zeta_omega=z / Y_scale, min_decrease=min_decrease, ) if snap.w_control is not None: @@ -1371,22 +1398,24 @@ def 
sensitivity_to_zeta_omega( else: omega_eff = omega_fake - att = compute_sdid_estimator( - snap.Y_pre_control, - snap.Y_post_control, - y_pre_t_mean, - y_post_t_mean, + att_n = compute_sdid_estimator( + Y_pre_control_n, + Y_post_control_n, + y_pre_t_mean_n, + y_post_t_mean_n, omega_eff, time_weights, ) - synthetic_pre = snap.Y_pre_control @ omega_eff - pre_fit = float(np.sqrt(np.mean((y_pre_t_mean - synthetic_pre) ** 2))) + synthetic_pre_n = Y_pre_control_n @ omega_eff + pre_fit_n = float(np.sqrt(np.mean((y_pre_t_mean_n - synthetic_pre_n) ** 2))) herf = float(np.sum(omega_eff ** 2)) rows.append( { "zeta_omega": z, - "att": float(att), - "pre_fit_rmse": pre_fit, + # Unit weights are scale-invariant; ATT and RMSE are + # scale-equivariant. Report original-Y units. + "att": float(att_n * Y_scale), + "pre_fit_rmse": pre_fit_n * Y_scale, "max_unit_weight": float(np.max(omega_eff)), "effective_n": float("nan") if herf == 0 else 1.0 / herf, } diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py index e3f43740..bddaacb9 100644 --- a/diff_diff/synthetic_did.py +++ b/diff_diff/synthetic_did.py @@ -667,6 +667,8 @@ def fit( # type: ignore[override] post_periods=list(post_periods), w_control=w_control, w_treated=w_treated, + Y_shift=Y_shift, + Y_scale=Y_scale, ) # Freeze the public diagnostic arrays so mutation via the results diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 31ba4f72..810b8fc7 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1519,7 +1519,7 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi - **Jackknife with non-finite LOO estimate**: Returns NaN SE. Unlike bootstrap/placebo, jackknife is deterministic and cannot skip failed iterations; NaN propagates through `var()` (matches R behavior). - **Jackknife with survey weights**: Guards on effective positive support (omega * w_control > 0 and w_treated > 0) after composition, not raw FW counts. 
Returns NaN SE if fewer than 2 effective controls or 2 positive-weight treated units. Per-iteration zero-sum guards return NaN for individual LOO iterations when remaining composed weights sum to zero. - **Note:** Survey support: weights, strata, PSU, and FPC are all supported. Full-design surveys use Rao-Wu rescaled bootstrap (Phase 6); non-bootstrap variance methods (`variance_method="placebo"` or `"jackknife"`) require weights-only (strata/PSU/FPC require bootstrap). Both sides weighted per WLS regression interpretation: treated-side means are survey-weighted (Frank-Wolfe target and ATT formula); control-side synthetic weights are composed with survey weights post-optimization (ω_eff = ω * w_co, renormalized). Frank-Wolfe optimization itself is unweighted — survey importance enters after trajectory-matching. Covariate residualization uses WLS with survey weights. Placebo, jackknife, and bootstrap SE preserve survey weights on both sides. -- **Note:** Internal Y normalization. Before weight optimization, the estimator, and variance procedures, `fit()` centers Y by `mean(Y_pre_control)` and scales by `std(Y_pre_control)`; `Y_scale` falls back to `1.0` when std is non-finite or below `1e-12 * max(|mean|, 1)`. Auto-regularization and `noise_level` are computed on normalized Y; user-supplied `zeta_omega` / `zeta_lambda` are divided by `Y_scale` internally for Frank-Wolfe. τ, SE, CI, the placebo/bootstrap/jackknife effect vectors, `results_.noise_level`, and `results_.zeta_omega` / `results_.zeta_lambda` are all reported on the user's original outcome scale (user-supplied zetas are echoed back exactly to avoid float roundoff). Mathematically a no-op — τ is location-invariant and scale-equivariant, and FW weights are invariant under `(Y, ζ) → (Y/s, ζ/s)` — but prevents catastrophic cancellation in the SDID double-difference when outcomes span millions-to-billions (see synth-inference/synthdid#71 for the R-package version of this issue). 
Normalization constants are derived from controls' pre-period only so the reference is unaffected by treatment. Scope: `in_time_placebo()` and `sensitivity_to_zeta_omega()` continue to run on the stored original-scale snapshot with original-scale zetas (preserving pre-fix behavior); extending normalization into those diagnostic paths is tracked separately. +- **Note:** Internal Y normalization. Before weight optimization, the estimator, and variance procedures, `fit()` centers Y by `mean(Y_pre_control)` and scales by `std(Y_pre_control)`; `Y_scale` falls back to `1.0` when std is non-finite or below `1e-12 * max(|mean|, 1)`. Auto-regularization and `noise_level` are computed on normalized Y; user-supplied `zeta_omega` / `zeta_lambda` are divided by `Y_scale` internally for Frank-Wolfe. τ, SE, CI, the placebo/bootstrap/jackknife effect vectors, `results_.noise_level`, and `results_.zeta_omega` / `results_.zeta_lambda` are all reported on the user's original outcome scale (user-supplied zetas are echoed back exactly to avoid float roundoff). Mathematically a no-op — τ is location-invariant and scale-equivariant, and FW weights are invariant under `(Y, ζ) → (Y/s, ζ/s)` — but prevents catastrophic cancellation in the SDID double-difference when outcomes span millions-to-billions (see synth-inference/synthdid#71 for the R-package version of this issue). Normalization constants are derived from controls' pre-period only so the reference is unaffected by treatment. `in_time_placebo()` and `sensitivity_to_zeta_omega()` reuse the exact same `Y_shift` / `Y_scale` captured on the fit snapshot: they normalize the re-sliced arrays before re-running Frank-Wolfe, pass `zeta / Y_scale` to the weight solvers, and rescale the returned `att` and `pre_fit_rmse` by `Y_scale` before reporting; unit-weight diagnostics (`max_unit_weight`, `effective_n`) are scale-invariant and reported directly. 
*Validation diagnostics (post-fit methods on `SyntheticDiDResults`):* diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py index c7a77545..87f61acc 100644 --- a/tests/test_methodology_sdid.py +++ b/tests/test_methodology_sdid.py @@ -2296,3 +2296,211 @@ def test_detects_true_effect_at_extreme_scale(self, variance_method): f"Effect at Y~1e9 must reject null; p_value={r.p_value} " f"(variance_method={variance_method})" ) + + +class TestDiagnosticScaleParity: + """Post-PR #312: in_time_placebo() and sensitivity_to_zeta_omega() must + inherit the same original-scale normalization contract that the main fit + path uses. Both diagnostics re-run Frank-Wolfe on the stored fit-snapshot + arrays, so at extreme Y they previously re-created the catastrophic + cancellation PR #312 fixed on the main path (audit finding D-4).""" + + _SCALES = [(1.0, 0.0), (1e6, 1e9), (1e9, -1e6)] + + @staticmethod + def _rescale(df, a, b): + out = df.copy() + out["outcome"] = a * out["outcome"] + b + return out + + @staticmethod + def _fit(data, seed=1): + return SyntheticDiD(variance_method="jackknife", seed=seed).fit( + data, outcome="outcome", treatment="treated", + unit="unit", time="period", + post_periods=[5, 6, 7], + ) + + def test_in_time_placebo_scale_equivariance(self): + """in_time_placebo att/pre_fit_rmse must scale by |a| across + (Y → a*Y + b). 
Pre-fix at extreme scale the diagnostic re-ran FW on + original-scale snapshot arrays and cancellation corrupted att.""" + data = _make_panel(seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r0 = self._fit(data) + placebo0 = r0.in_time_placebo() + fake_periods = placebo0["fake_treatment_period"].tolist() + + for a, b in self._SCALES: + scaled = self._rescale(data, a, b) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(scaled) + placebo = r.in_time_placebo(fake_treatment_periods=fake_periods) + + assert list(placebo["fake_treatment_period"]) == fake_periods + for row0, row in zip(placebo0.to_dict("records"), + placebo.to_dict("records")): + if np.isnan(row0["att"]): + assert np.isnan(row["att"]), f"att at (a={a}, b={b})" + assert np.isnan(row["pre_fit_rmse"]) + continue + assert row["att"] / a == pytest.approx(row0["att"], rel=1e-6), ( + f"att at (a={a}, b={b}), " + f"fake_period={row0['fake_treatment_period']}" + ) + assert row["pre_fit_rmse"] / abs(a) == pytest.approx( + row0["pre_fit_rmse"], rel=1e-6 + ), ( + f"pre_fit_rmse at (a={a}, b={b}), " + f"fake_period={row0['fake_treatment_period']}" + ) + + def test_sensitivity_to_zeta_omega_scale_equivariance(self): + """sensitivity_to_zeta_omega att/pre_fit_rmse must scale by |a|; + unit-weight diagnostics (max_unit_weight, effective_n) must be + scale-invariant.""" + data = _make_panel(seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r0 = self._fit(data) + sens0 = r0.sensitivity_to_zeta_omega() + zeta_grid = sens0["zeta_omega"].tolist() + + for a, b in self._SCALES: + scaled = self._rescale(data, a, b) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(scaled) + sens = r.sensitivity_to_zeta_omega( + zeta_grid=[a * z for z in zeta_grid] + ) + for row0, row in zip(sens0.to_dict("records"), + sens.to_dict("records")): + assert row["att"] / a 
== pytest.approx(row0["att"], rel=1e-6), ( + f"att at (a={a}, b={b}), zeta={row0['zeta_omega']}" + ) + assert row["pre_fit_rmse"] / abs(a) == pytest.approx( + row0["pre_fit_rmse"], rel=1e-6 + ) + # omega-derived diagnostics are scale-invariant. + assert row["max_unit_weight"] == pytest.approx( + row0["max_unit_weight"], rel=1e-6 + ) + assert row["effective_n"] == pytest.approx( + row0["effective_n"], rel=1e-6 + ) + + def test_in_time_placebo_detectable_at_extreme_scale(self): + """Pre-fix regression: at Y~1e9 the placebo re-fit corrupted ATTs via + cancellation so diagnostic numbers were garbage. Post-fix, all + placebo rows on a zero-effect DGP must be finite and at least one + must land within 5*noise_level in original-Y units.""" + rng = np.random.default_rng(0) + n_control, n_treated, n_pre, n_post = 20, 3, 7, 3 + baseline_level = 1e9 + rows = [] + for unit in range(n_control + n_treated): + unit_fe = rng.normal(0, 2e6) + for t in range(n_pre + n_post): + y = baseline_level + unit_fe + t * 3e5 + rng.normal(0, 5e5) + rows.append({"unit": unit, "period": t, + "treated": int(unit >= n_control), "outcome": y}) + data = pd.DataFrame(rows) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(data, seed=7) + + placebo = r.in_time_placebo() + finite = placebo.dropna(subset=["att"]) + assert len(finite) > 0 + assert np.all(np.isfinite(finite["att"].values)) + assert np.all(np.isfinite(finite["pre_fit_rmse"].values)) + assert (np.abs(finite["att"]) < 5.0 * r.noise_level).any(), ( + f"At least one placebo row should be within 5*noise_level " + f"({5.0 * r.noise_level}); got atts {finite['att'].tolist()}" + ) + + +class TestHeterogeneousAndRampingScale: + """D-4b: the existing TestScaleEquivariance suite is affine-only + (Y → a*Y + b with a single scalar a). These pathways are not covered: + + - Cross-unit heterogeneous scale: different units span 1e6 to 1e9. 
+ - Cross-period ramping: baseline trend growing several orders of + magnitude across periods. + + Both were candidate triggers for the original SDID silent-failure report + and must stay detectable after any future refactor of the normalization + contract.""" + + @staticmethod + def _fit(data, seed=1): + return SyntheticDiD(variance_method="jackknife", seed=seed).fit( + data, outcome="outcome", treatment="treated", + unit="unit", time="period", + post_periods=[5, 6, 7], + ) + + def test_cross_unit_heterogeneous_scale(self): + """Units spanning 1e6 to 1e9 must still produce finite fit and + diagnostic output. Heterogeneous levels historically triggered the + cancellation pathway even at modest Y; this is a regression trap + for future refactors of the normalization contract.""" + rng = np.random.default_rng(11) + n_control, n_treated, n_pre, n_post = 20, 3, 6, 3 + rows = [] + for unit in range(n_control + n_treated): + unit_level = 10 ** rng.uniform(6, 9) + is_treated = unit >= n_control + for t in range(n_pre + n_post): + y = unit_level * (1 + 0.02 * t) + rng.normal(0, unit_level * 0.01) + if is_treated and t >= n_pre: + y += 0.05 * unit_level + rows.append({"unit": unit, "period": t, + "treated": int(is_treated), "outcome": y}) + data = pd.DataFrame(rows) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(data) + + assert np.isfinite(r.att) and np.isfinite(r.se) + assert r.se > 0 + placebo = r.in_time_placebo() + assert np.all(np.isfinite(placebo["att"].dropna())) + assert np.all(np.isfinite(placebo["pre_fit_rmse"].dropna())) + sens = r.sensitivity_to_zeta_omega() + assert np.all(np.isfinite(sens["att"])) + assert np.all(np.isfinite(sens["pre_fit_rmse"])) + + def test_cross_period_ramping_trend(self): + """A strong cross-period trend (baseline level multiplies across + periods) must still produce a detectable, finite ATT and finite + diagnostic output.""" + rng = np.random.default_rng(13) + n_control, n_treated, 
n_pre, n_post = 20, 3, 6, 3 + rows = [] + for unit in range(n_control + n_treated): + unit_fe = rng.normal(0, 1.0) + is_treated = unit >= n_control + for t in range(n_pre + n_post): + trend = 10 ** (5 + 0.4 * t) + y = trend + unit_fe * trend * 0.01 + rng.normal(0, trend * 0.005) + if is_treated and t >= n_pre: + y += 0.01 * trend + rows.append({"unit": unit, "period": t, + "treated": int(is_treated), "outcome": y}) + data = pd.DataFrame(rows) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(data) + + assert np.isfinite(r.att) and np.isfinite(r.se) + assert r.se > 0 + placebo = r.in_time_placebo() + assert np.all(np.isfinite(placebo["att"].dropna())) + assert np.all(np.isfinite(placebo["pre_fit_rmse"].dropna())) + sens = r.sensitivity_to_zeta_omega() + assert np.all(np.isfinite(sens["att"])) + assert np.all(np.isfinite(sens["pre_fit_rmse"])) From b72c0b9ee51ac65b746e893b8ffd8b76dc72e17e Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 06:29:54 -0400 Subject: [PATCH 2/2] Address local AI review P3: backward-compat test for legacy snapshots Pre-fix _SyntheticDiDFitSnapshot objects don't carry Y_shift / Y_scale fields. After this PR those fields default to (0.0, 1.0), which makes the new normalization path a pure no-op on the legacy snapshot. Add a regression test that overwrites results_._fit_snapshot with a manually-constructed snapshot using only the pre-PR fields, then confirms both in_time_placebo() and sensitivity_to_zeta_omega() preserve schema and row count. Locks in the backward-compatibility contract so future refactors don't accidentally tighten the normalization path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_methodology_sdid.py | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py index 87f61acc..20b6f203 100644 --- a/tests/test_methodology_sdid.py +++ b/tests/test_methodology_sdid.py @@ -2423,6 +2423,63 @@ def test_in_time_placebo_detectable_at_extreme_scale(self): ) +class TestDiagnosticSnapshotBackwardCompat: + """Locks in the backward-compatibility contract for legacy + _SyntheticDiDFitSnapshot objects that pre-date Y_shift/Y_scale. Their + defaults (0.0, 1.0) must make the new normalization a pure no-op so + older cached snapshots still drive diagnostic refits unchanged.""" + + def test_legacy_snapshot_defaults_are_noop(self): + from diff_diff.results import _SyntheticDiDFitSnapshot + + data = _make_panel(seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = SyntheticDiD(variance_method="jackknife", seed=1).fit( + data, outcome="outcome", treatment="treated", + unit="unit", time="period", + post_periods=[5, 6, 7], + ) + + # Baseline diagnostic output with the real (fit-captured) normalization. + placebo0 = r.in_time_placebo() + sens0 = r.sensitivity_to_zeta_omega() + + # Overwrite the snapshot with a legacy one built without Y_shift / + # Y_scale. With the defaults (0.0, 1.0) the diagnostics skip + # normalization and refit on the original-scale arrays, which is + # expected to be benign on this small, well-scaled panel. Only schema + # and row count are asserted below; value-level agreement with the + # fit-captured output is not checked by this test.
+ snap = r._fit_snapshot + legacy_snap = _SyntheticDiDFitSnapshot( + Y_pre_control=np.array(snap.Y_pre_control), + Y_post_control=np.array(snap.Y_post_control), + Y_pre_treated=np.array(snap.Y_pre_treated), + Y_post_treated=np.array(snap.Y_post_treated), + control_unit_ids=list(snap.control_unit_ids), + treated_unit_ids=list(snap.treated_unit_ids), + pre_periods=list(snap.pre_periods), + post_periods=list(snap.post_periods), + w_control=snap.w_control, + w_treated=snap.w_treated, + # Defaults — no Y_shift/Y_scale captured. + ) + # Confirm the defaults are what we expect. + assert legacy_snap.Y_shift == 0.0 + assert legacy_snap.Y_scale == 1.0 + + r._fit_snapshot = legacy_snap + placebo_legacy = r.in_time_placebo() + sens_legacy = r.sensitivity_to_zeta_omega() + + # Shape and columns must match. + assert list(placebo_legacy.columns) == list(placebo0.columns) + assert list(sens_legacy.columns) == list(sens0.columns) + assert len(placebo_legacy) == len(placebo0) + assert len(sens_legacy) == len(sens0) + + class TestHeterogeneousAndRampingScale: """D-4b: the existing TestScaleEquivariance suite is affine-only (Y → a*Y + b with a single scalar a). These pathways are not covered: