diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py index ca8baf07..97976f7c 100644 --- a/diff_diff/synthetic_did.py +++ b/diff_diff/synthetic_did.py @@ -416,43 +416,92 @@ def fit( # type: ignore[override] ) ) - # Compute auto-regularization (or use user overrides) - auto_zeta_omega, auto_zeta_lambda = _compute_regularization( - Y_pre_control, len(treated_units), len(post_periods) + # --- Y normalization --------------------------------------------- + # τ is location-invariant and scale-equivariant in Y. Normalizing Y + # once before weight optimization, the estimator, and the variance + # procedures (and rescaling τ/SE/CI/effects by Y_scale at the end) + # is mathematically a no-op but prevents ~6-digit precision loss in + # the SDID double-difference when outcomes span millions-to-billions. + # Normalization constants come from controls' pre-period only so the + # reference is unaffected by treatment. See REGISTRY.md §SyntheticDiD + # edge cases and synth-inference/synthdid#71 for R's version. + Y_shift = float(np.mean(Y_pre_control)) + Y_scale_raw = float(np.std(Y_pre_control)) + # Relative floor: avoid amplifying roundoff when std is tiny but + # nonzero (near-constant Y_pre_control). Fall back to 1.0 in that + # case and on non-finite std. + _scale_floor = 1e-12 * max(abs(Y_shift), 1.0) + Y_scale = ( + Y_scale_raw + if np.isfinite(Y_scale_raw) and Y_scale_raw > _scale_floor + else 1.0 + ) + Y_pre_control_n = (Y_pre_control - Y_shift) / Y_scale + Y_post_control_n = (Y_post_control - Y_shift) / Y_scale + Y_pre_treated_n = (Y_pre_treated - Y_shift) / Y_scale + Y_post_treated_n = (Y_post_treated - Y_shift) / Y_scale + + # Auto-regularization on normalized Y. FW's argmin is invariant under + # (Y, ζ) -> (Y/s, ζ/s); auto-zetas computed on Y_n are already on the + # normalized scale. User-supplied zetas are in original-Y units and + # are divided by Y_scale for internal FW use. Original-scale values + # are stored on results_ / self so diagnostic methods (in_time_placebo, + # sensitivity_to_zeta_omega) — which operate on the stored original- + # scale fit snapshot — see the same zeta the user specified. + auto_zeta_omega_n, auto_zeta_lambda_n = _compute_regularization( + Y_pre_control_n, len(treated_units), len(post_periods) + ) + zeta_omega_n = ( + self.zeta_omega / Y_scale + if self.zeta_omega is not None + else auto_zeta_omega_n + ) + zeta_lambda_n = ( + self.zeta_lambda / Y_scale + if self.zeta_lambda is not None + else auto_zeta_lambda_n + ) + # Report the user-supplied value exactly (no roundoff from /*Y_scale + # roundtrip); report auto-zeta rescaled to original Y units. + zeta_omega = ( + self.zeta_omega if self.zeta_omega is not None else auto_zeta_omega_n * Y_scale + ) + zeta_lambda = ( + self.zeta_lambda if self.zeta_lambda is not None else auto_zeta_lambda_n * Y_scale ) - zeta_omega = self.zeta_omega if self.zeta_omega is not None else auto_zeta_omega - zeta_lambda = self.zeta_lambda if self.zeta_lambda is not None else auto_zeta_lambda - # Store noise level for diagnostics + # Store noise level for diagnostics (reported on original Y scale). from diff_diff.utils import _compute_noise_level - noise_level = _compute_noise_level(Y_pre_control) + noise_level_n = _compute_noise_level(Y_pre_control_n) + noise_level = noise_level_n * Y_scale - # Data-dependent convergence threshold (matches R's 1e-5 * noise.level). - # Floor of 1e-5 when noise_level == 0: R would use 0.0, causing FW to - # run all max_iter iterations. The result is equivalent (zero-noise - # data has no variation to optimize), but the floor enables early stop. - min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5 + # Data-dependent convergence threshold (matches R's 1e-5 * noise.level), + # evaluated on normalized Y since FW operates on normalized Y. Floor of + # 1e-5 when noise is zero: R would use 0.0, causing FW to run all + # max_iter iterations; the floor enables early stop on zero-variation + # inputs without changing the optimum. + min_decrease = 1e-5 * noise_level_n if noise_level_n > 0 else 1e-5 - # Compute unit weights (Frank-Wolfe with sparsification) - # Survey weights enter via the treated mean target + # Compute unit weights (Frank-Wolfe with sparsification) on normalized Y. + # Survey weights enter via the treated mean target. if w_treated is not None: - Y_pre_treated_mean = np.average(Y_pre_treated, axis=1, weights=w_treated) + Y_pre_treated_mean_n = np.average(Y_pre_treated_n, axis=1, weights=w_treated) else: - Y_pre_treated_mean = np.mean(Y_pre_treated, axis=1) + Y_pre_treated_mean_n = np.mean(Y_pre_treated_n, axis=1) unit_weights = compute_sdid_unit_weights( - Y_pre_control, - Y_pre_treated_mean, - zeta_omega=zeta_omega, + Y_pre_control_n, + Y_pre_treated_mean_n, + zeta_omega=zeta_omega_n, min_decrease=min_decrease, ) - # Compute time weights (Frank-Wolfe on collapsed form) + # Compute time weights (Frank-Wolfe on collapsed form) on normalized Y. time_weights = compute_time_weights( - Y_pre_control, - Y_post_control, - zeta_lambda=zeta_lambda, + Y_pre_control_n, + Y_post_control_n, + zeta_lambda=zeta_lambda_n, min_decrease=min_decrease, ) @@ -465,23 +514,32 @@ def fit( # type: ignore[override] else: omega_eff = unit_weights - # Compute SDID estimate + # Compute SDID estimate on normalized Y, then rescale to original units. if w_treated is not None: - Y_post_treated_mean = np.average(Y_post_treated, axis=1, weights=w_treated) + Y_post_treated_mean_n = np.average( + Y_post_treated_n, axis=1, weights=w_treated + ) else: - Y_post_treated_mean = np.mean(Y_post_treated, axis=1) + Y_post_treated_mean_n = np.mean(Y_post_treated_n, axis=1) - att = compute_sdid_estimator( - Y_pre_control, - Y_post_control, - Y_pre_treated_mean, - Y_post_treated_mean, + att_n = compute_sdid_estimator( + Y_pre_control_n, + Y_post_control_n, + Y_pre_treated_mean_n, + Y_post_treated_mean_n, omega_eff, time_weights, ) + att = att_n * Y_scale - # Compute pre-treatment fit (RMSE) using composed weights. - # Retain trajectories on results for plotting / custom diagnostics. + # Recover original-scale treated means for diagnostics / trajectories. + Y_pre_treated_mean = Y_pre_treated_mean_n * Y_scale + Y_shift + Y_post_treated_mean = Y_post_treated_mean_n * Y_scale + Y_shift + + # Compute pre-treatment fit (RMSE) using composed weights on the + # original Y (user-visible scale). omega_eff is a simplex — applies + # cleanly to any linear rescale of Y — so trajectories live on the + # original outcome scale for plotting and the poor-fit warning. synthetic_pre_trajectory = Y_pre_control @ omega_eff synthetic_post_trajectory = Y_post_control @ omega_eff pre_fit_rmse = np.sqrt( @@ -510,49 +568,57 @@ def fit( # type: ignore[override] treated_pre_trajectory = Y_pre_treated_mean treated_post_trajectory = Y_post_treated_mean - # Compute standard errors based on variance_method + # Compute standard errors on normalized Y, rescale to original units. + # Variance procedures resample / permute indices (independent of Y + # values) so RNG streams stay aligned across scales. if self.variance_method == "bootstrap": - se, bootstrap_estimates = self._bootstrap_se( - Y_pre_control, - Y_post_control, - Y_pre_treated, - Y_post_treated, + se_n, bootstrap_estimates_n = self._bootstrap_se( + Y_pre_control_n, + Y_post_control_n, + Y_pre_treated_n, + Y_post_treated_n, unit_weights, time_weights, w_treated=w_treated, w_control=w_control, resolved_survey=_unit_resolved_survey, ) - placebo_effects = bootstrap_estimates + se = se_n * Y_scale + placebo_effects = np.asarray(bootstrap_estimates_n) * Y_scale inference_method = "bootstrap" elif self.variance_method == "jackknife": # Fixed-weight jackknife (R's synthdid Algorithm 3) - se, jackknife_estimates = self._jackknife_se( - Y_pre_control, - Y_post_control, - Y_pre_treated, - Y_post_treated, + se_n, jackknife_estimates_n = self._jackknife_se( + Y_pre_control_n, + Y_post_control_n, + Y_pre_treated_n, + Y_post_treated_n, unit_weights, time_weights, w_treated=w_treated, w_control=w_control, ) - placebo_effects = jackknife_estimates + se = se_n * Y_scale + placebo_effects = np.asarray(jackknife_estimates_n) * Y_scale inference_method = "jackknife" else: - # Use placebo-based variance (R's synthdid Algorithm 4) - se, placebo_effects = self._placebo_variance_se( - Y_pre_control, - Y_post_control, - Y_pre_treated_mean, - Y_post_treated_mean, + # Use placebo-based variance (R's synthdid Algorithm 4). + # Placebo re-estimates ω, λ inside the loop; it must receive the + # normalized zetas and operate on normalized Y. + se_n, placebo_effects_n = self._placebo_variance_se( + Y_pre_control_n, + Y_post_control_n, + Y_pre_treated_mean_n, + Y_post_treated_mean_n, n_treated=len(treated_units), - zeta_omega=zeta_omega, - zeta_lambda=zeta_lambda, + zeta_omega=zeta_omega_n, + zeta_lambda=zeta_lambda_n, min_decrease=min_decrease, replications=self.n_bootstrap, w_control=w_control, ) + se = se_n * Y_scale + placebo_effects = np.asarray(placebo_effects_n) * Y_scale inference_method = "placebo" # Compute test statistics diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 0bf3c5de..5c0cd045 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1515,6 +1515,7 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi - **Jackknife with non-finite LOO estimate**: Returns NaN SE. Unlike bootstrap/placebo, jackknife is deterministic and cannot skip failed iterations; NaN propagates through `var()` (matches R behavior). - **Jackknife with survey weights**: Guards on effective positive support (omega * w_control > 0 and w_treated > 0) after composition, not raw FW counts. Returns NaN SE if fewer than 2 effective controls or 2 positive-weight treated units. Per-iteration zero-sum guards return NaN for individual LOO iterations when remaining composed weights sum to zero. - **Note:** Survey support: weights, strata, PSU, and FPC are all supported. Full-design surveys use Rao-Wu rescaled bootstrap (Phase 6); non-bootstrap variance methods (`variance_method="placebo"` or `"jackknife"`) require weights-only (strata/PSU/FPC require bootstrap). Both sides weighted per WLS regression interpretation: treated-side means are survey-weighted (Frank-Wolfe target and ATT formula); control-side synthetic weights are composed with survey weights post-optimization (ω_eff = ω * w_co, renormalized). Frank-Wolfe optimization itself is unweighted — survey importance enters after trajectory-matching. Covariate residualization uses WLS with survey weights. Placebo, jackknife, and bootstrap SE preserve survey weights on both sides. +- **Note:** Internal Y normalization. Before weight optimization, the estimator, and variance procedures, `fit()` centers Y by `mean(Y_pre_control)` and scales by `std(Y_pre_control)`; `Y_scale` falls back to `1.0` when std is non-finite or below `1e-12 * max(|mean|, 1)`. Auto-regularization and `noise_level` are computed on normalized Y; user-supplied `zeta_omega` / `zeta_lambda` are divided by `Y_scale` internally for Frank-Wolfe. τ, SE, CI, the placebo/bootstrap/jackknife effect vectors, `results_.noise_level`, and `results_.zeta_omega` / `results_.zeta_lambda` are all reported on the user's original outcome scale (user-supplied zetas are echoed back exactly to avoid float roundoff). Mathematically a no-op — τ is location-invariant and scale-equivariant, and FW weights are invariant under `(Y, ζ) → (Y/s, ζ/s)` — but prevents catastrophic cancellation in the SDID double-difference when outcomes span millions-to-billions (see synth-inference/synthdid#71 for the R-package version of this issue). Normalization constants are derived from controls' pre-period only so the reference is unaffected by treatment. Scope: `in_time_placebo()` and `sensitivity_to_zeta_omega()` continue to run on the stored original-scale snapshot with original-scale zetas (preserving pre-fix behavior); extending normalization into those diagnostic paths is tracked separately. *Validation diagnostics (post-fit methods on `SyntheticDiDResults`):* diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py index 37ad91f6..36b8162a 100644 --- a/tests/test_methodology_sdid.py +++ b/tests/test_methodology_sdid.py @@ -2100,3 +2100,146 @@ def test_asdict_excludes_internal_diagnostic_state(self): assert "_fit_snapshot" not in d assert "_loo_unit_ids" not in d assert "_loo_roles" not in d + + +# ============================================================================= +# Scale equivariance — internal Y normalization prevents catastrophic +# cancellation when outcomes are on extreme scales (millions-to-billions). +# τ is location-invariant and scale-equivariant, so fitting on ``a*Y + b`` +# should produce τ/SE that scale exactly with ``a`` and leave p-values +# unchanged. On normally-scaled data the fix must be a numerical no-op. +# ============================================================================= + + +class TestScaleEquivariance: + """SDID is location-invariant and scale-equivariant in Y.""" + + # Hard-coded baselines captured pre-fix on a well-scaled panel. If these + # drift the fix is not a true no-op on normal data and review is warranted. + _BASELINE = { + "placebo": (4.603349837478791, 0.29385822261006445, 0.004975124378109453, 200), + "bootstrap": (4.603349837478791, 0.1589472532935243, 0.4869109947643979, 191), + "jackknife": (4.603349837478791, 0.19908075946622925, 2.716551077849484e-118, 23), + } + + # (a, b) pairs. Includes extreme scales where pre-fix SDID loses + # ~6 mantissa digits in the double-difference subtraction. + _SCALES = [(1e-6, 0.0), (1.0, 0.0), (1e6, 1e9), (1e9, -1e6), (1.0, 1e9)] + + @staticmethod + def _rescale(df, a, b): + out = df.copy() + out["outcome"] = a * out["outcome"] + b + return out + + @staticmethod + def _fit(data, variance_method, *, seed=1, n_bootstrap=200): + return SyntheticDiD( + variance_method=variance_method, n_bootstrap=n_bootstrap, seed=seed + ).fit( + data, + outcome="outcome", + treatment="treated", + unit="unit", + time="period", + post_periods=[5, 6, 7], + ) + + @pytest.mark.parametrize("variance_method", ["placebo", "bootstrap", "jackknife"]) + def test_baseline_parity_small_scale(self, variance_method): + """Existing-fixture results match pre-fix literals — guards against + drift; a true no-op should hit float epsilon relative to baseline.""" + att0, se0, p0, n0 = self._BASELINE[variance_method] + data = _make_panel(seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(data, variance_method) + assert r.att == pytest.approx(att0, rel=1e-8) + assert r.se == pytest.approx(se0, rel=1e-8) + assert r.p_value == pytest.approx(p0, rel=1e-8) + assert len(r.placebo_effects) == n0 + + @pytest.mark.parametrize("variance_method", ["placebo", "bootstrap", "jackknife"]) + def test_scale_equivariance(self, variance_method): + """τ/a, SE/|a|, p-value, and n_successful must be invariant under + (Y → a*Y + b) across ~15 orders of magnitude.""" + data = _make_panel(seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r0 = self._fit(data, variance_method) + att0, se0, p0 = r0.att, r0.se, r0.p_value + n0 = len(r0.placebo_effects) + noise0 = r0.noise_level + zeta_omega0 = r0.zeta_omega + + for a, b in self._SCALES: + scaled = self._rescale(data, a, b) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = self._fit(scaled, variance_method) + # Variance-method success count must be identical; divergence + # would shift the empirical p-value floor 1/(n+1). + assert len(r.placebo_effects) == n0, ( + f"(a={a}, b={b}) yielded {len(r.placebo_effects)} effects, " + f"baseline had {n0}" + ) + assert r.att / a == pytest.approx(att0, rel=1e-8), f"att failed at a={a}, b={b}" + assert r.se / abs(a) == pytest.approx(se0, rel=1e-6), f"se failed at a={a}, b={b}" + assert r.p_value == pytest.approx(p0, rel=1e-6), f"p failed at a={a}, b={b}" + # Reported diagnostics scale with Y just like SE (noise_level, + # zeta_omega are both in outcome units). + assert r.noise_level / abs(a) == pytest.approx(noise0, rel=1e-8) + assert r.zeta_omega / abs(a) == pytest.approx(zeta_omega0, rel=1e-8) + + @pytest.mark.parametrize("variance_method", ["placebo", "bootstrap", "jackknife"]) + def test_detects_true_effect_at_extreme_scale(self, variance_method): + """Pre-fix regression: catastrophic cancellation at Y~1e9 degraded + SEs so p-values clustered near 0.5 regardless of true effect. Here + the true ATT is 0.5% of baseline — below the pre-fix precision + floor — and must still produce a detectable, correctly-scaled τ.""" + rng = np.random.default_rng(0) + n_control, n_treated, n_pre, n_post = 25, 3, 6, 4 + baseline_level = 1e9 # deliberately extreme + true_att = 5e6 # 0.5% of baseline — lost in 6-digit cancellation pre-fix + rows = [] + for unit in range(n_control + n_treated): + is_treated = unit >= n_control + unit_fe = rng.normal(0, 2e6) + for t in range(n_pre + n_post): + y = baseline_level + unit_fe + t * 3e5 + rng.normal(0, 5e5) + if is_treated and t >= n_pre: + y += true_att + rows.append({ + "unit": unit, "period": t, + "treated": int(is_treated), "outcome": y, + }) + data = pd.DataFrame(rows) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + r = SyntheticDiD( + variance_method=variance_method, n_bootstrap=200, seed=7 + ).fit( + data, outcome="outcome", treatment="treated", + unit="unit", time="period", + post_periods=list(range(n_pre, n_pre + n_post)), + ) + + # τ must land near the true effect (within ~3 SE); SE must be + # positive, finite, and small enough that the effect is significant + # by z-statistic. For placebo/jackknife the empirical p-value is a + # proper null-distribution test; for bootstrap the empirical p is + # not a null test (draws center on τ̂), so check z directly. + assert np.isfinite(r.att) and np.isfinite(r.se) + assert r.se > 0 + assert abs(r.att - true_att) < max(3 * r.se, 0.1 * true_att) + z = abs(r.att / r.se) + assert z > 3, ( + f"Effect at Y~1e9 must be detectable by z-stat; att={r.att}, " + f"se={r.se}, z={z} (variance_method={variance_method})" + ) + if variance_method != "bootstrap": + assert r.p_value < 0.05, ( + f"Effect at Y~1e9 must reject null; p_value={r.p_value} " + f"(variance_method={variance_method})" + )