From 37bb13ee025728cb3b2f805fd4706b38cd20d6e3 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sun, 19 Apr 2026 06:27:12 -0400
Subject: [PATCH 1/2] Extend PR #312 Y-normalization contract into SDID
 diagnostic methods

PR #312 centered and scaled Y once in SyntheticDiD.fit() to avoid
catastrophic cancellation in the SDID double-difference at extreme Y.
The fix did not reach the two public diagnostic methods on
SyntheticDiDResults: in_time_placebo() and sensitivity_to_zeta_omega()
both re-run Frank-Wolfe on the original-scale fit snapshot with
original-scale zetas, so at Y ~ 1e9 they reproduce the same silent
failure the main path was fixed for.

Capture Y_shift and Y_scale on the fit snapshot, apply the same
(Y - shift) / scale normalization inside the two diagnostic methods,
pass zeta / Y_scale and min_decrease derived from the normalized
noise level to the FW weight solvers, and rescale att / pre_fit_rmse
back to original-Y units before reporting. Unit-weight diagnostics
(max_unit_weight, effective_n) are scale-invariant and reported
without rescaling.

Regression coverage:
- TestDiagnosticScaleParity: under (Y -> a*Y + b) att must scale by a
  and pre_fit_rmse by |a| for both diagnostic methods; a no-effect
  DGP at Y ~ 1e9 must produce finite placebo atts, at least one of
  which lands within 5 * noise_level.
- TestHeterogeneousAndRampingScale: cross-unit heterogeneous scale
  (units spanning 1e6 to 1e9) and cross-period ramping (trend
  growing just over 3 orders of magnitude, 1e5 to ~1.6e8, across
  periods) must leave the fit and both diagnostic surfaces finite.
  These are the heterogeneity pathways the original
  TestScaleEquivariance (affine-only) suite did not cover.

Covered by audit axis A (numerical precision / scale fragility).
Findings D-4 and D-4b from docs/audits/silent-failures-findings.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/results.py           | 117 ++++++++++++-------
 diff_diff/synthetic_did.py     |   2 +
 docs/methodology/REGISTRY.md   |   2 +-
 tests/test_methodology_sdid.py | 208 +++++++++++++++++++++++++++++++++
 4 files changed, 284 insertions(+), 45 deletions(-)

diff --git a/diff_diff/results.py b/diff_diff/results.py
index 504a84d0..e15e9927 100644
--- a/diff_diff/results.py
+++ b/diff_diff/results.py
@@ -669,6 +669,12 @@ class _SyntheticDiDFitSnapshot:
     post_periods: List[Any]
     w_control: Optional[np.ndarray] = None
     w_treated: Optional[np.ndarray] = None
+    # Normalization constants captured during fit() so diagnostic methods can
+    # reproduce the main path's centering+scaling and avoid catastrophic
+    # cancellation on extreme-Y panels. Defaults preserve behavior for
+    # snapshots built before these fields existed.
+    Y_shift: float = 0.0
+    Y_scale: float = 1.0
 
     def __post_init__(self):
         for arr in (
@@ -1144,8 +1150,17 @@ def in_time_placebo(
                 "in_time_placebo() needs zeta_omega and zeta_lambda from the "
                 "original fit. Expected on the results object but found None."
             )
+        # Reproduce the main fit path's Y normalization (Y → (Y - shift) / scale)
+        # so Frank-Wolfe sees the same ~O(1) inputs it saw during fit() and the
+        # SDID double-difference does not suffer ~6-digit cancellation at
+        # extreme Y. See SyntheticDiD.fit() and REGISTRY.md §SyntheticDiD.
+        Y_shift = snap.Y_shift
+        Y_scale = snap.Y_scale
+        zeta_omega_n = zeta_omega / Y_scale
+        zeta_lambda_n = zeta_lambda / Y_scale
         noise_level = self.noise_level if self.noise_level is not None else 0.0
-        min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5
+        noise_level_n = noise_level / Y_scale
+        min_decrease = 1e-5 * noise_level_n if noise_level_n > 0 else 1e-5
 
         # Build the list of (fake_period, position) pairs to iterate.
         period_to_idx = {p: i for i, p in enumerate(pre_periods)}
@@ -1195,29 +1210,29 @@ def in_time_placebo(
                 rows.append(row)
                 continue
 
-            Y_pre_c = snap.Y_pre_control[:i, :]
-            Y_post_c = snap.Y_pre_control[i:, :]
-            Y_pre_t = snap.Y_pre_treated[:i, :]
-            Y_post_t = snap.Y_pre_treated[i:, :]
+            Y_pre_c_n = (snap.Y_pre_control[:i, :] - Y_shift) / Y_scale
+            Y_post_c_n = (snap.Y_pre_control[i:, :] - Y_shift) / Y_scale
+            Y_pre_t_n = (snap.Y_pre_treated[:i, :] - Y_shift) / Y_scale
+            Y_post_t_n = (snap.Y_pre_treated[i:, :] - Y_shift) / Y_scale
 
             if snap.w_treated is not None:
                 w_t = snap.w_treated
-                y_pre_t_mean = np.average(Y_pre_t, axis=1, weights=w_t)
-                y_post_t_mean = np.average(Y_post_t, axis=1, weights=w_t)
+                y_pre_t_mean_n = np.average(Y_pre_t_n, axis=1, weights=w_t)
+                y_post_t_mean_n = np.average(Y_post_t_n, axis=1, weights=w_t)
             else:
-                y_pre_t_mean = np.mean(Y_pre_t, axis=1)
-                y_post_t_mean = np.mean(Y_post_t, axis=1)
+                y_pre_t_mean_n = np.mean(Y_pre_t_n, axis=1)
+                y_post_t_mean_n = np.mean(Y_post_t_n, axis=1)
 
             omega_fake = compute_sdid_unit_weights(
-                Y_pre_c,
-                y_pre_t_mean,
-                zeta_omega=zeta_omega,
+                Y_pre_c_n,
+                y_pre_t_mean_n,
+                zeta_omega=zeta_omega_n,
                 min_decrease=min_decrease,
             )
             lambda_fake = compute_time_weights(
-                Y_pre_c,
-                Y_post_c,
-                zeta_lambda=zeta_lambda,
+                Y_pre_c_n,
+                Y_post_c_n,
+                zeta_lambda=zeta_lambda_n,
                 min_decrease=min_decrease,
             )
 
@@ -1231,20 +1246,22 @@ def in_time_placebo(
             else:
                 omega_eff_fake = omega_fake
 
-            att_fake = compute_sdid_estimator(
-                Y_pre_c,
-                Y_post_c,
-                y_pre_t_mean,
-                y_post_t_mean,
+            att_fake_n = compute_sdid_estimator(
+                Y_pre_c_n,
+                Y_post_c_n,
+                y_pre_t_mean_n,
+                y_post_t_mean_n,
                 omega_eff_fake,
                 lambda_fake,
             )
-            synthetic_pre_fake = Y_pre_c @ omega_eff_fake
-            pre_fit = float(
-                np.sqrt(np.mean((y_pre_t_mean - synthetic_pre_fake) ** 2))
+            synthetic_pre_fake_n = Y_pre_c_n @ omega_eff_fake
+            pre_fit_n = float(
+                np.sqrt(np.mean((y_pre_t_mean_n - synthetic_pre_fake_n) ** 2))
             )
-            row["att"] = float(att_fake)
-            row["pre_fit_rmse"] = pre_fit
+            # ATT is scale-equivariant and shift-invariant in Y; RMSE is
+            # scale-equivariant. Rescale back to original-Y units.
+            row["att"] = float(att_fake_n * Y_scale)
+            row["pre_fit_rmse"] = pre_fit_n * Y_scale
             rows.append(row)
 
         return pd.DataFrame(rows)
@@ -1320,19 +1337,29 @@ def sensitivity_to_zeta_omega(
         else:
             zeta_values = [float(z) for z in zeta_grid]
 
+        # Reproduce the main fit path's Y normalization so FW sees ~O(1)
+        # inputs; see in_time_placebo() for the same pattern.
+        Y_shift = snap.Y_shift
+        Y_scale = snap.Y_scale
         noise_level = self.noise_level if self.noise_level is not None else 0.0
-        min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5
+        noise_level_n = noise_level / Y_scale
+        min_decrease = 1e-5 * noise_level_n if noise_level_n > 0 else 1e-5
+
+        Y_pre_control_n = (snap.Y_pre_control - Y_shift) / Y_scale
+        Y_post_control_n = (snap.Y_post_control - Y_shift) / Y_scale
+        Y_pre_treated_n = (snap.Y_pre_treated - Y_shift) / Y_scale
+        Y_post_treated_n = (snap.Y_post_treated - Y_shift) / Y_scale
 
         if snap.w_treated is not None:
-            y_pre_t_mean = np.average(
-                snap.Y_pre_treated, axis=1, weights=snap.w_treated
+            y_pre_t_mean_n = np.average(
+                Y_pre_treated_n, axis=1, weights=snap.w_treated
             )
-            y_post_t_mean = np.average(
-                snap.Y_post_treated, axis=1, weights=snap.w_treated
+            y_post_t_mean_n = np.average(
+                Y_post_treated_n, axis=1, weights=snap.w_treated
             )
         else:
-            y_pre_t_mean = np.mean(snap.Y_pre_treated, axis=1)
-            y_post_t_mean = np.mean(snap.Y_post_treated, axis=1)
+            y_pre_t_mean_n = np.mean(Y_pre_treated_n, axis=1)
+            y_post_t_mean_n = np.mean(Y_post_treated_n, axis=1)
 
         columns = [
             "zeta_omega",
@@ -1348,9 +1375,9 @@ def sensitivity_to_zeta_omega(
         rows: List[Dict[str, Any]] = []
         for z in zeta_values:
             omega_fake = compute_sdid_unit_weights(
-                snap.Y_pre_control,
-                y_pre_t_mean,
-                zeta_omega=z,
+                Y_pre_control_n,
+                y_pre_t_mean_n,
+                zeta_omega=z / Y_scale,
                 min_decrease=min_decrease,
             )
             if snap.w_control is not None:
@@ -1371,22 +1398,24 @@ def sensitivity_to_zeta_omega(
             else:
                 omega_eff = omega_fake
 
-            att = compute_sdid_estimator(
-                snap.Y_pre_control,
-                snap.Y_post_control,
-                y_pre_t_mean,
-                y_post_t_mean,
+            att_n = compute_sdid_estimator(
+                Y_pre_control_n,
+                Y_post_control_n,
+                y_pre_t_mean_n,
+                y_post_t_mean_n,
                 omega_eff,
                 time_weights,
             )
-            synthetic_pre = snap.Y_pre_control @ omega_eff
-            pre_fit = float(np.sqrt(np.mean((y_pre_t_mean - synthetic_pre) ** 2)))
+            synthetic_pre_n = Y_pre_control_n @ omega_eff
+            pre_fit_n = float(np.sqrt(np.mean((y_pre_t_mean_n - synthetic_pre_n) ** 2)))
             herf = float(np.sum(omega_eff ** 2))
             rows.append(
                 {
                     "zeta_omega": z,
-                    "att": float(att),
-                    "pre_fit_rmse": pre_fit,
+                    # Unit weights are scale-invariant; ATT and RMSE are
+                    # scale-equivariant. Report original-Y units.
+                    "att": float(att_n * Y_scale),
+                    "pre_fit_rmse": pre_fit_n * Y_scale,
                     "max_unit_weight": float(np.max(omega_eff)),
                     "effective_n": float("nan") if herf == 0 else 1.0 / herf,
                 }
diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py
index e3f43740..bddaacb9 100644
--- a/diff_diff/synthetic_did.py
+++ b/diff_diff/synthetic_did.py
@@ -667,6 +667,8 @@ def fit(  # type: ignore[override]
             post_periods=list(post_periods),
             w_control=w_control,
             w_treated=w_treated,
+            Y_shift=Y_shift,
+            Y_scale=Y_scale,
         )
 
         # Freeze the public diagnostic arrays so mutation via the results
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 31ba4f72..810b8fc7 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -1519,7 +1519,7 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi
 - **Jackknife with non-finite LOO estimate**: Returns NaN SE. Unlike bootstrap/placebo, jackknife is deterministic and cannot skip failed iterations; NaN propagates through `var()` (matches R behavior).
 - **Jackknife with survey weights**: Guards on effective positive support (omega * w_control > 0 and w_treated > 0) after composition, not raw FW counts. Returns NaN SE if fewer than 2 effective controls or 2 positive-weight treated units. Per-iteration zero-sum guards return NaN for individual LOO iterations when remaining composed weights sum to zero.
 - **Note:** Survey support: weights, strata, PSU, and FPC are all supported. Full-design surveys use Rao-Wu rescaled bootstrap (Phase 6); non-bootstrap variance methods (`variance_method="placebo"` or `"jackknife"`) require weights-only (strata/PSU/FPC require bootstrap). Both sides weighted per WLS regression interpretation: treated-side means are survey-weighted (Frank-Wolfe target and ATT formula); control-side synthetic weights are composed with survey weights post-optimization (ω_eff = ω * w_co, renormalized). Frank-Wolfe optimization itself is unweighted — survey importance enters after trajectory-matching. Covariate residualization uses WLS with survey weights. Placebo, jackknife, and bootstrap SE preserve survey weights on both sides.
-- **Note:** Internal Y normalization. Before weight optimization, the estimator, and variance procedures, `fit()` centers Y by `mean(Y_pre_control)` and scales by `std(Y_pre_control)`; `Y_scale` falls back to `1.0` when std is non-finite or below `1e-12 * max(|mean|, 1)`. Auto-regularization and `noise_level` are computed on normalized Y; user-supplied `zeta_omega` / `zeta_lambda` are divided by `Y_scale` internally for Frank-Wolfe. τ, SE, CI, the placebo/bootstrap/jackknife effect vectors, `results_.noise_level`, and `results_.zeta_omega` / `results_.zeta_lambda` are all reported on the user's original outcome scale (user-supplied zetas are echoed back exactly to avoid float roundoff). Mathematically a no-op — τ is location-invariant and scale-equivariant, and FW weights are invariant under `(Y, ζ) → (Y/s, ζ/s)` — but prevents catastrophic cancellation in the SDID double-difference when outcomes span millions-to-billions (see synth-inference/synthdid#71 for the R-package version of this issue). Normalization constants are derived from controls' pre-period only so the reference is unaffected by treatment. Scope: `in_time_placebo()` and `sensitivity_to_zeta_omega()` continue to run on the stored original-scale snapshot with original-scale zetas (preserving pre-fix behavior); extending normalization into those diagnostic paths is tracked separately.
+- **Note:** Internal Y normalization. Before weight optimization, the estimator, and variance procedures, `fit()` centers Y by `mean(Y_pre_control)` and scales by `std(Y_pre_control)`; `Y_scale` falls back to `1.0` when std is non-finite or below `1e-12 * max(|mean|, 1)`. Auto-regularization and `noise_level` are computed on normalized Y; user-supplied `zeta_omega` / `zeta_lambda` are divided by `Y_scale` internally for Frank-Wolfe. τ, SE, CI, the placebo/bootstrap/jackknife effect vectors, `results_.noise_level`, and `results_.zeta_omega` / `results_.zeta_lambda` are all reported on the user's original outcome scale (user-supplied zetas are echoed back exactly to avoid float roundoff). Mathematically a no-op — τ is location-invariant and scale-equivariant, and FW weights are invariant under `(Y, ζ) → (Y/s, ζ/s)` — but prevents catastrophic cancellation in the SDID double-difference when outcomes span millions-to-billions (see synth-inference/synthdid#71 for the R-package version of this issue). Normalization constants are derived from controls' pre-period only so the reference is unaffected by treatment. `in_time_placebo()` and `sensitivity_to_zeta_omega()` reuse the exact same `Y_shift` / `Y_scale` captured on the fit snapshot: they normalize the re-sliced arrays before re-running Frank-Wolfe, pass `zeta / Y_scale` to the weight solvers, and rescale the returned `att` and `pre_fit_rmse` by `Y_scale` before reporting; unit-weight diagnostics (`max_unit_weight`, `effective_n`) are scale-invariant and reported directly.
 
 *Validation diagnostics (post-fit methods on `SyntheticDiDResults`):*
 
diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py
index c7a77545..87f61acc 100644
--- a/tests/test_methodology_sdid.py
+++ b/tests/test_methodology_sdid.py
@@ -2296,3 +2296,211 @@ def test_detects_true_effect_at_extreme_scale(self, variance_method):
                 f"Effect at Y~1e9 must reject null; p_value={r.p_value} "
                 f"(variance_method={variance_method})"
             )
+
+
+class TestDiagnosticScaleParity:
+    """Post-PR #312: in_time_placebo() and sensitivity_to_zeta_omega() must
+    inherit the same original-scale normalization contract that the main fit
+    path uses. Both diagnostics re-run Frank-Wolfe on the stored fit-snapshot
+    arrays, so at extreme Y they previously re-created the catastrophic
+    cancellation PR #312 fixed on the main path (audit finding D-4)."""
+
+    _SCALES = [(1.0, 0.0), (1e6, 1e9), (1e9, -1e6)]
+
+    @staticmethod
+    def _rescale(df, a, b):
+        out = df.copy()
+        out["outcome"] = a * out["outcome"] + b
+        return out
+
+    @staticmethod
+    def _fit(data, seed=1):
+        return SyntheticDiD(variance_method="jackknife", seed=seed).fit(
+            data, outcome="outcome", treatment="treated",
+            unit="unit", time="period",
+            post_periods=[5, 6, 7],
+        )
+
+    def test_in_time_placebo_scale_equivariance(self):
+        """in_time_placebo att/pre_fit_rmse must scale by |a| across
+        (Y → a*Y + b). Pre-fix at extreme scale the diagnostic re-ran FW on
+        original-scale snapshot arrays and cancellation corrupted att."""
+        data = _make_panel(seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r0 = self._fit(data)
+        placebo0 = r0.in_time_placebo()
+        fake_periods = placebo0["fake_treatment_period"].tolist()
+
+        for a, b in self._SCALES:
+            scaled = self._rescale(data, a, b)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                r = self._fit(scaled)
+            placebo = r.in_time_placebo(fake_treatment_periods=fake_periods)
+
+            assert list(placebo["fake_treatment_period"]) == fake_periods
+            for row0, row in zip(placebo0.to_dict("records"),
+                                 placebo.to_dict("records")):
+                if np.isnan(row0["att"]):
+                    assert np.isnan(row["att"]), f"att at (a={a}, b={b})"
+                    assert np.isnan(row["pre_fit_rmse"])
+                    continue
+                assert row["att"] / a == pytest.approx(row0["att"], rel=1e-6), (
+                    f"att at (a={a}, b={b}), "
+                    f"fake_period={row0['fake_treatment_period']}"
+                )
+                assert row["pre_fit_rmse"] / abs(a) == pytest.approx(
+                    row0["pre_fit_rmse"], rel=1e-6
+                ), (
+                    f"pre_fit_rmse at (a={a}, b={b}), "
+                    f"fake_period={row0['fake_treatment_period']}"
+                )
+
+    def test_sensitivity_to_zeta_omega_scale_equivariance(self):
+        """sensitivity_to_zeta_omega att/pre_fit_rmse must scale by |a|;
+        unit-weight diagnostics (max_unit_weight, effective_n) must be
+        scale-invariant."""
+        data = _make_panel(seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r0 = self._fit(data)
+        sens0 = r0.sensitivity_to_zeta_omega()
+        zeta_grid = sens0["zeta_omega"].tolist()
+
+        for a, b in self._SCALES:
+            scaled = self._rescale(data, a, b)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                r = self._fit(scaled)
+            sens = r.sensitivity_to_zeta_omega(
+                zeta_grid=[a * z for z in zeta_grid]
+            )
+            for row0, row in zip(sens0.to_dict("records"),
+                                 sens.to_dict("records")):
+                assert row["att"] / a == pytest.approx(row0["att"], rel=1e-6), (
+                    f"att at (a={a}, b={b}), zeta={row0['zeta_omega']}"
+                )
+                assert row["pre_fit_rmse"] / abs(a) == pytest.approx(
+                    row0["pre_fit_rmse"], rel=1e-6
+                )
+                # omega-derived diagnostics are scale-invariant.
+                assert row["max_unit_weight"] == pytest.approx(
+                    row0["max_unit_weight"], rel=1e-6
+                )
+                assert row["effective_n"] == pytest.approx(
+                    row0["effective_n"], rel=1e-6
+                )
+
+    def test_in_time_placebo_detectable_at_extreme_scale(self):
+        """Pre-fix regression: at Y~1e9 the placebo re-fit corrupted ATTs via
+        cancellation so diagnostic numbers were garbage. Post-fix, all
+        placebo rows on a zero-effect DGP must be finite and at least one
+        must land within 5*noise_level in original-Y units."""
+        rng = np.random.default_rng(0)
+        n_control, n_treated, n_pre, n_post = 20, 3, 7, 3
+        baseline_level = 1e9
+        rows = []
+        for unit in range(n_control + n_treated):
+            unit_fe = rng.normal(0, 2e6)
+            for t in range(n_pre + n_post):
+                y = baseline_level + unit_fe + t * 3e5 + rng.normal(0, 5e5)
+                rows.append({"unit": unit, "period": t,
+                             "treated": int(unit >= n_control), "outcome": y})
+        data = pd.DataFrame(rows)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r = self._fit(data, seed=7)
+
+        placebo = r.in_time_placebo()
+        finite = placebo.dropna(subset=["att"])
+        assert len(finite) > 0
+        assert np.all(np.isfinite(finite["att"].values))
+        assert np.all(np.isfinite(finite["pre_fit_rmse"].values))
+        assert (np.abs(finite["att"]) < 5.0 * r.noise_level).any(), (
+            f"At least one placebo row should be within 5*noise_level "
+            f"({5.0 * r.noise_level}); got atts {finite['att'].tolist()}"
+        )
+
+
+class TestHeterogeneousAndRampingScale:
+    """D-4b: the existing TestScaleEquivariance suite is affine-only
+    (Y → a*Y + b with a single scalar a). These pathways are not covered:
+
+    - Cross-unit heterogeneous scale: different units span 1e6 to 1e9.
+    - Cross-period ramping: baseline trend growing several orders of
+      magnitude across periods.
+
+    Both were candidate triggers for the original SDID silent-failure report
+    and must stay detectable after any future refactor of the normalization
+    contract."""
+
+    @staticmethod
+    def _fit(data, seed=1):
+        return SyntheticDiD(variance_method="jackknife", seed=seed).fit(
+            data, outcome="outcome", treatment="treated",
+            unit="unit", time="period",
+            post_periods=[5, 6, 7],
+        )
+
+    def test_cross_unit_heterogeneous_scale(self):
+        """Units spanning 1e6 to 1e9 must still produce finite fit and
+        diagnostic output. Heterogeneous levels historically triggered the
+        cancellation pathway even at modest Y; this is a regression trap
+        for future refactors of the normalization contract."""
+        rng = np.random.default_rng(11)
+        n_control, n_treated, n_pre, n_post = 20, 3, 6, 3
+        rows = []
+        for unit in range(n_control + n_treated):
+            unit_level = 10 ** rng.uniform(6, 9)
+            is_treated = unit >= n_control
+            for t in range(n_pre + n_post):
+                y = unit_level * (1 + 0.02 * t) + rng.normal(0, unit_level * 0.01)
+                if is_treated and t >= n_pre:
+                    y += 0.05 * unit_level
+                rows.append({"unit": unit, "period": t,
+                             "treated": int(is_treated), "outcome": y})
+        data = pd.DataFrame(rows)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r = self._fit(data)
+
+        assert np.isfinite(r.att) and np.isfinite(r.se)
+        assert r.se > 0
+        placebo = r.in_time_placebo()
+        assert np.all(np.isfinite(placebo["att"].dropna()))
+        assert np.all(np.isfinite(placebo["pre_fit_rmse"].dropna()))
+        sens = r.sensitivity_to_zeta_omega()
+        assert np.all(np.isfinite(sens["att"]))
+        assert np.all(np.isfinite(sens["pre_fit_rmse"]))
+
+    def test_cross_period_ramping_trend(self):
+        """A strong cross-period trend (baseline level multiplies across
+        periods) must still produce a finite ATT, a finite positive SE, and
+        finite diagnostic output."""
+        rng = np.random.default_rng(13)
+        n_control, n_treated, n_pre, n_post = 20, 3, 6, 3
+        rows = []
+        for unit in range(n_control + n_treated):
+            unit_fe = rng.normal(0, 1.0)
+            is_treated = unit >= n_control
+            for t in range(n_pre + n_post):
+                trend = 10 ** (5 + 0.4 * t)
+                y = trend + unit_fe * trend * 0.01 + rng.normal(0, trend * 0.005)
+                if is_treated and t >= n_pre:
+                    y += 0.01 * trend
+                rows.append({"unit": unit, "period": t,
+                             "treated": int(is_treated), "outcome": y})
+        data = pd.DataFrame(rows)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r = self._fit(data)
+
+        assert np.isfinite(r.att) and np.isfinite(r.se)
+        assert r.se > 0
+        placebo = r.in_time_placebo()
+        assert np.all(np.isfinite(placebo["att"].dropna()))
+        assert np.all(np.isfinite(placebo["pre_fit_rmse"].dropna()))
+        sens = r.sensitivity_to_zeta_omega()
+        assert np.all(np.isfinite(sens["att"]))
+        assert np.all(np.isfinite(sens["pre_fit_rmse"]))

From b72c0b9ee51ac65b746e893b8ffd8b76dc72e17e Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sun, 19 Apr 2026 06:29:54 -0400
Subject: [PATCH 2/2] Address local AI review P3: backward-compat test for
 legacy snapshots

Pre-fix _SyntheticDiDFitSnapshot objects don't carry Y_shift / Y_scale
fields. After this PR those fields default to (0.0, 1.0), which makes
the new normalization path a pure no-op on the legacy snapshot.

Add a regression test that overwrites results_._fit_snapshot with a
manually-constructed snapshot using only the pre-PR fields, then
confirms both in_time_placebo() and sensitivity_to_zeta_omega()
preserve schema and row count. Locks in the backward-compatibility
contract so future refactors don't accidentally tighten the
normalization path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_methodology_sdid.py | 57 ++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py
index 87f61acc..20b6f203 100644
--- a/tests/test_methodology_sdid.py
+++ b/tests/test_methodology_sdid.py
@@ -2423,6 +2423,63 @@ def test_in_time_placebo_detectable_at_extreme_scale(self):
         )
 
 
+class TestDiagnosticSnapshotBackwardCompat:
+    """Locks in the backward-compatibility contract for legacy
+    _SyntheticDiDFitSnapshot objects that pre-date Y_shift/Y_scale. Their
+    defaults (0.0, 1.0) must make the new normalization a pure no-op so
+    older cached snapshots still drive diagnostic refits unchanged."""
+
+    def test_legacy_snapshot_defaults_are_noop(self):
+        from diff_diff.results import _SyntheticDiDFitSnapshot
+
+        data = _make_panel(seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            r = SyntheticDiD(variance_method="jackknife", seed=1).fit(
+                data, outcome="outcome", treatment="treated",
+                unit="unit", time="period",
+                post_periods=[5, 6, 7],
+            )
+
+        # Baseline diagnostic output with the real (fit-captured) normalization.
+        placebo0 = r.in_time_placebo()
+        sens0 = r.sensitivity_to_zeta_omega()
+
+        # Overwrite the snapshot with a legacy one built without Y_shift /
+        # Y_scale. With the defaults (0.0, 1.0) the normalization step
+        # (Y - shift) / scale is the identity, so both diagnostics re-run on
+        # the raw arrays. This test only checks that they still run and keep
+        # the same columns and row count as the fit-captured version; it does
+        # not compare att / pre_fit_rmse values between the two snapshots.
+        snap = r._fit_snapshot
+        legacy_snap = _SyntheticDiDFitSnapshot(
+            Y_pre_control=np.array(snap.Y_pre_control),
+            Y_post_control=np.array(snap.Y_post_control),
+            Y_pre_treated=np.array(snap.Y_pre_treated),
+            Y_post_treated=np.array(snap.Y_post_treated),
+            control_unit_ids=list(snap.control_unit_ids),
+            treated_unit_ids=list(snap.treated_unit_ids),
+            pre_periods=list(snap.pre_periods),
+            post_periods=list(snap.post_periods),
+            w_control=snap.w_control,
+            w_treated=snap.w_treated,
+            # Defaults — no Y_shift/Y_scale captured.
+        )
+        # Confirm the defaults are what we expect.
+        assert legacy_snap.Y_shift == 0.0
+        assert legacy_snap.Y_scale == 1.0
+
+        r._fit_snapshot = legacy_snap
+        placebo_legacy = r.in_time_placebo()
+        sens_legacy = r.sensitivity_to_zeta_omega()
+
+        # Shape and columns must match.
+        assert list(placebo_legacy.columns) == list(placebo0.columns)
+        assert list(sens_legacy.columns) == list(sens0.columns)
+        assert len(placebo_legacy) == len(placebo0)
+        assert len(sens_legacy) == len(sens0)
+
+
 class TestHeterogeneousAndRampingScale:
     """D-4b: the existing TestScaleEquivariance suite is affine-only
     (Y → a*Y + b with a single scalar a). These pathways are not covered: