From 8339dcc22da6519e61f1cf8f311e88c8de72ee8a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:12:27 -0400 Subject: [PATCH 1/2] Signal silent sparse -> dense lstsq fallback in ImputationDiD and TwoStageDiD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses axis-C findings #8, #9, and #10 from the silent-failures audit: three sites where a sparse factorization failure silently fell back to dense lstsq without any user-facing signal. - diff_diff/imputation.py:1516 (variance path: scipy.sparse.linalg.spsolve on (A_0' W A_0) z = A_1' w). Bare `except Exception` was swallowing the root cause before dense lstsq. Now emits a UserWarning identifying the exception type and explaining the fallback implication. - diff_diff/two_stage.py:1647 (GMM sandwich: sparse_factorized on X'_{10} W X_{10} for Stage 1 normal equations). `except RuntimeError` was silent; now emits a UserWarning. - diff_diff/two_stage_bootstrap.py:134 (bootstrap path: same pattern as above). `except RuntimeError` was silent; now emits a UserWarning. All three are single-call sites (per fit, or per aggregation level, or per bootstrap replicate at most a handful of times) so no aggregation wrapper pattern is needed — one warning per fallback event is appropriate. REGISTRY.md updated under ImputationDiD and TwoStageDiD. New tests (3): monkey-patch the sparse entry point to raise a RuntimeError, run .fit(), assert the UserWarning fires with the expected message prefix. Works against both the variance and bootstrap surfaces. Axis-C baseline: 3 major silent-fallback sites (imputation, two_stage, two_stage_bootstrap) -> 0 remaining in these files. PowerAnalysis simulation counter (finding #11) and ContinuousDiD B-spline (#12) still open as separate follow-ups. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/imputation.py | 14 +++++++++-- diff_diff/two_stage.py | 14 +++++++++-- diff_diff/two_stage_bootstrap.py | 12 ++++++++- docs/methodology/REGISTRY.md | 3 ++- tests/test_imputation.py | 25 +++++++++++++++++++ tests/test_two_stage.py | 42 ++++++++++++++++++++++++++++++++ 6 files changed, 104 insertions(+), 6 deletions(-) diff --git a/diff_diff/imputation.py b/diff_diff/imputation.py index 3798927e..49a40333 100644 --- a/diff_diff/imputation.py +++ b/diff_diff/imputation.py @@ -1515,8 +1515,18 @@ def _build_A_sparse(df_sub, unit_vals, time_vals): A0tA0_sparse = A_0.T @ A_0 # stays sparse try: z = spsolve(A0tA0_sparse.tocsc(), A1_w) - except Exception: - # Fallback to dense lstsq if sparse solver fails (e.g., singular matrix) + except Exception as exc: + # Fallback to dense lstsq if sparse solver fails (e.g., singular matrix). + # Silent-failure audit axis C: emit a UserWarning on fallback instead + # of swallowing the error. + warnings.warn( + "ImputationDiD variance: sparse solve of (A_0' [W] A_0) z = A_1' w " + f"failed ({type(exc).__name__}); falling back to dense lstsq. This " + "may indicate a rank-deficient or near-singular normal-equations " + "matrix and variance estimates may be less reliable.", + UserWarning, + stacklevel=2, + ) A0tA0_dense = A0tA0_sparse.toarray() z, _, _, _ = np.linalg.lstsq(A0tA0_dense, A1_w, rcond=None) diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py index 560ded0a..6a385bd9 100644 --- a/diff_diff/two_stage.py +++ b/diff_diff/two_stage.py @@ -1652,8 +1652,18 @@ def _compute_gmm_variance( gamma_hat = np.column_stack( [solve_XtX(Xt1_WX2[:, j]) for j in range(Xt1_WX2.shape[1])] ) - except RuntimeError: - # Singular matrix — fall back to dense least-squares + except RuntimeError as exc: + # Singular matrix — fall back to dense least-squares. Silent-failure + # audit axis C: emit a UserWarning on fallback instead of swallowing. + warnings.warn( + "TwoStageDiD GMM sandwich: sparse factorization of " + f"(X'_{{10}} W X_{{10}}) failed ({type(exc).__name__}); falling " + "back to dense lstsq. This may indicate a rank-deficient or " + "near-singular Stage 1 design matrix and SE estimates may be " + "less reliable.", + UserWarning, + stacklevel=2, + ) gamma_hat = np.linalg.lstsq(XtWX_10.toarray(), Xt1_WX2, rcond=None)[0] if gamma_hat.ndim == 1: gamma_hat = gamma_hat.reshape(-1, 1) diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py index 3effef4e..ea23ef40 100644 --- a/diff_diff/two_stage_bootstrap.py +++ b/diff_diff/two_stage_bootstrap.py @@ -139,7 +139,17 @@ def _compute_cluster_S_scores( gamma_hat = np.column_stack( [solve_XtX(Xt1_X2[:, j]) for j in range(Xt1_X2.shape[1])] ) - except RuntimeError: + except RuntimeError as exc: + # Silent-failure audit axis C: emit a UserWarning on fallback instead + # of swallowing the error. + warnings.warn( + "TwoStageDiD bootstrap: sparse factorization of X_10' X_10 " + f"failed ({type(exc).__name__}); falling back to dense lstsq. " + "This may indicate a rank-deficient or near-singular Stage 1 " + "design matrix and bootstrap SE estimates may be less reliable.", + UserWarning, + stacklevel=2, + ) gamma_hat = np.linalg.lstsq(XtX_10.toarray(), Xt1_X2, rcond=None)[0] if gamma_hat.ndim == 1: gamma_hat = gamma_hat.reshape(-1, 1) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index cc5b9037..64d2e63a 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1079,7 +1079,7 @@ where `W_it(h) = 1[K_it = h]` are lead indicators, estimated on `Omega_0` only. - **Non-constant `first_treat` within a unit:** Emits `UserWarning` identifying the count and example unit. The estimator proceeds using the first observed value per unit (via `.first()` aggregation), but results may be unreliable. - **treatment_effects DataFrame weights:** `weight` column uses `1/n_valid` for finite tau_hat and 0 for NaN tau_hat, consistent with the ATT estimand (unweighted), or normalized survey weights `sw_i/sum(sw)` when `survey_design` is active. - **Rank-deficient covariates in variance:** Covariates with NaN coefficients (dropped for rank deficiency in Step 1) are excluded from the variance design matrices `A_0`/`A_1`. Only covariates with finite coefficients participate in the `v_it` projection. -- **Sparse variance solver:** `_compute_v_untreated_with_covariates` uses `scipy.sparse.linalg.spsolve` to solve `(A_0'A_0) z = A_1'w` without densifying the normal equations matrix. Falls back to dense `lstsq` if the sparse solver fails. +- **Sparse variance solver:** `_compute_v_untreated_with_covariates` uses `scipy.sparse.linalg.spsolve` to solve `(A_0'A_0) z = A_1'w` without densifying the normal equations matrix. Falls back to dense `lstsq` if the sparse solver fails and emits a `UserWarning` on the fallback (silent-failure audit axis C) so callers know variance estimates came from the degraded path. - **Note:** Survey weights enter ImputationDiD via weighted iterative FE (Step 1), survey-weighted ATT aggregation (Step 3), and design-based variance via `compute_survey_if_variance()`. PSU clustering, stratification, and FPC are fully supported in the Theorem 3 variance path. When `resolved_survey` is present, the observation-level influence function (`v_it * epsilon_tilde_it`) is passed to `compute_survey_if_variance()` which applies the stratified PSU-level sandwich with FPC correction. Strata also enters survey df (n_PSU - n_strata) for t-distribution inference. Bootstrap + survey supported (Phase 6) via PSU-level multiplier weights. - **Bootstrap inference:** Uses multiplier bootstrap on the Theorem 3 influence function: `psi_i = sum_t v_it * epsilon_tilde_it`. Cluster-level psi sums are pre-computed for each aggregation target (overall, per-horizon, per-group), then perturbed with multiplier weights (Rademacher by default; configurable via `bootstrap_weights` parameter to use Mammen or Webb weights, matching CallawaySantAnna). This is a library extension (not in the paper) consistent with CallawaySantAnna/SunAbraham bootstrap patterns. - **Auxiliary residuals (Equation 8):** Uses v_it-weighted tau_tilde_g formula: `tau_tilde_g = sum(v_it * tau_hat_it) / sum(v_it)` within each partition group. Zero-weight groups (common in event-study SE computation) fall back to unweighted mean. @@ -1162,6 +1162,7 @@ Our implementation uses multiplier bootstrap on the GMM influence function: clus - **Zero-observation cohorts in group effects:** If all treated observations for a cohort have NaN `y_tilde` (excluded from estimation), that cohort's group effect is NaN with n_obs=0. - **Note:** Survey weights in TwoStageDiD GMM sandwich via weighted cross-products: bread uses (X'_2 W X_2)^{-1}, gamma_hat uses (X'_{10} W X_{10})^{-1}(X'_1 W X_2), per-cluster scores multiply by survey weights. PSU clustering, stratification, and FPC are fully supported in the meat matrix via `_compute_stratified_meat_from_psu_scores()`. When strata or FPC are present, the meat computation replaces `S' S` with the stratified formula `sum_h (1 - f_h) * (n_h/(n_h-1)) * centered_h' centered_h`. Strata also enters survey df (n_PSU - n_strata) for t-distribution inference. Bootstrap + survey supported (Phase 6) via PSU-level multiplier weights. - **Note:** Both the iterative FE solver (`_iterative_fe`, Stage 1) and the iterative alternating-projection demeaning helper (`_iterative_demean`, used in covariate residualization) emit `UserWarning` when `max_iter` exhausts without reaching `tol`, via `diff_diff.utils.warn_if_not_converged`. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the logistic/Poisson IRLS pattern in `linalg.py`. +- **Note:** The GMM sandwich and bootstrap paths both use `scipy.sparse.linalg.factorized` for the Stage 1 normal-equations solve `(X'_{10} W X_{10}) gamma = X'_1 W X_2` and fall back to dense `lstsq` when the sparse factorization raises `RuntimeError` on a near-singular matrix. Both fallback sites emit a `UserWarning` (silent-failure audit axis C) so callers know SE estimates came from the degraded path rather than the fast sparse path. **Reference implementation(s):** - R: `did2s::did2s()` (Kyle Butts & John Gardner) diff --git a/tests/test_imputation.py b/tests/test_imputation.py index 53093d8d..4e8cf9ce 100644 --- a/tests/test_imputation.py +++ b/tests/test_imputation.py @@ -885,6 +885,31 @@ def test_sparse_solver_dense_fallback(self): assert np.isfinite(results.overall_se) assert results.overall_se > 0 + def test_sparse_solver_dense_fallback_emits_warning(self): + """Silent-failure audit axis C: the sparse -> dense lstsq fallback must + emit a UserWarning so callers are informed that variance estimates come + from the degraded path.""" + import unittest.mock + + data = generate_test_data(n_units=80, n_periods=8, seed=42) + rng = np.random.default_rng(42) + data["x1"] = rng.standard_normal(len(data)) + + est = ImputationDiD() + + with unittest.mock.patch( + "diff_diff.imputation.spsolve", side_effect=RuntimeError("test failure") + ): + with pytest.warns(UserWarning, match="sparse solve.*falling back to dense lstsq"): + est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + covariates=["x1"], + ) + # ============================================================================= # TestImputationBootstrap diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py index ebf093be..335ff942 100644 --- a/tests/test_two_stage.py +++ b/tests/test_two_stage.py @@ -490,6 +490,48 @@ def test_event_study_se_positive(self): assert eff["se"] > 0, f"SE at h={h} should be positive" assert np.isfinite(eff["se"]) + def test_sparse_factorized_dense_fallback_emits_warning(self): + """Silent-failure audit axis C: when sparse factorization of Stage 1's + normal-equations matrix fails and the GMM sandwich falls back to dense + lstsq, a UserWarning must surface so callers know SE came from the + degraded path rather than the fast sparse path.""" + import unittest.mock + + data = generate_test_data() + + with unittest.mock.patch( + "diff_diff.two_stage.sparse_factorized", + side_effect=RuntimeError("test failure"), + ): + with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"): + TwoStageDiD().fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) + + def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self): + """Silent-failure audit axis C: the TwoStage bootstrap path has the + same sparse->dense fallback and must also emit a UserWarning.""" + import unittest.mock + + data = generate_test_data() + + with unittest.mock.patch( + "diff_diff.two_stage_bootstrap.sparse_factorized", + side_effect=RuntimeError("test failure"), + ): + with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"): + TwoStageDiD(n_bootstrap=4, seed=42).fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) + # ============================================================================= # TestTwoStageDiDEdgeCases From 28507ec7fd32a44cfecc87fdc628d0eca3e1158e Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:24:32 -0400 Subject: [PATCH 2/2] Assert dense fallback SEs remain usable in TwoStage sparse->dense tests AI review on PR #319 flagged that my new TwoStage warning tests only verify that the UserWarning fires but not that the dense fallback still produces finite, usable SEs. A future control-flow regression could keep the warning while breaking the degraded path. Mirror the assertion shape used in the pre-existing ImputationDiD test_sparse_solver_dense_fallback: after the warned .fit(), assert overall_se is finite and > 0 for both the GMM sandwich and bootstrap paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_two_stage.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py index 335ff942..cf5a47b1 100644 --- a/tests/test_two_stage.py +++ b/tests/test_two_stage.py @@ -494,7 +494,11 @@ def test_sparse_factorized_dense_fallback_emits_warning(self): """Silent-failure audit axis C: when sparse factorization of Stage 1's normal-equations matrix fails and the GMM sandwich falls back to dense lstsq, a UserWarning must surface so callers know SE came from the - degraded path rather than the fast sparse path.""" + degraded path rather than the fast sparse path. + + Also verifies the dense fallback still yields finite, usable SEs so + that a future regression in the fallback control flow cannot keep the + warning while breaking the degraded path.""" import unittest.mock data = generate_test_data() @@ -504,7 +508,7 @@ def test_sparse_factorized_dense_fallback_emits_warning(self): side_effect=RuntimeError("test failure"), ): with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"): - TwoStageDiD().fit( + results = TwoStageDiD().fit( data, outcome="outcome", unit="unit", @@ -512,9 +516,16 @@ def test_sparse_factorized_dense_fallback_emits_warning(self): first_treat="first_treat", ) + # Dense fallback must still produce a usable SE. + assert np.isfinite(results.overall_se) + assert results.overall_se > 0 + def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self): """Silent-failure audit axis C: the TwoStage bootstrap path has the - same sparse->dense fallback and must also emit a UserWarning.""" + same sparse->dense fallback and must also emit a UserWarning. + + Also verifies the bootstrap dense fallback still yields finite, + usable SEs.""" import unittest.mock data = generate_test_data() @@ -524,7 +535,7 @@ def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self): side_effect=RuntimeError("test failure"), ): with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"): - TwoStageDiD(n_bootstrap=4, seed=42).fit( + results = TwoStageDiD(n_bootstrap=4, seed=42).fit( data, outcome="outcome", unit="unit", @@ -532,6 +543,10 @@ def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self): first_treat="first_treat", ) + # Bootstrap dense fallback must still produce a usable SE. + assert np.isfinite(results.overall_se) + assert results.overall_se > 0 + # ============================================================================= # TestTwoStageDiDEdgeCases