From 8339dcc22da6519e61f1cf8f311e88c8de72ee8a Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 18 Apr 2026 16:12:27 -0400
Subject: [PATCH 1/2] Signal silent sparse -> dense lstsq fallback in
 ImputationDiD and TwoStageDiD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses axis-C findings #8, #9, and #10 from the silent-failures audit:
three sites where a sparse factorization failure silently fell back to
dense lstsq without any user-facing signal.

- diff_diff/imputation.py:1516 (variance path: scipy.sparse.linalg.spsolve
  on (A_0' W A_0) z = A_1' w). Bare `except Exception` was swallowing
  the root cause before dense lstsq. Now emits a UserWarning identifying
  the exception type and explaining the fallback implication.
- diff_diff/two_stage.py:1647 (GMM sandwich: sparse_factorized on
  X'_{10} W X_{10} for Stage 1 normal equations). `except RuntimeError`
  was silent; now emits a UserWarning.
- diff_diff/two_stage_bootstrap.py:134 (bootstrap path: same pattern as
  above). `except RuntimeError` was silent; now emits a UserWarning.

All three are single-call sites (per fit, or per aggregation level, or
per bootstrap replicate at most a handful of times) so no aggregation
wrapper pattern is needed — one warning per fallback event is
appropriate.

REGISTRY.md updated under ImputationDiD and TwoStageDiD.

New tests (3): monkey-patch the sparse entry point to raise a
RuntimeError, run .fit(), assert the UserWarning fires with the
expected message prefix. Works against both the variance and bootstrap
surfaces.

Axis-C baseline: 3 major silent-fallback sites (imputation, two_stage,
two_stage_bootstrap) -> 0 remaining in these files. PowerAnalysis
simulation counter (finding #11) and ContinuousDiD B-spline (#12)
still open as separate follow-ups.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/imputation.py          | 14 +++++++++--
 diff_diff/two_stage.py           | 14 +++++++++--
 diff_diff/two_stage_bootstrap.py | 12 ++++++++-
 docs/methodology/REGISTRY.md     |  3 ++-
 tests/test_imputation.py         | 25 +++++++++++++++++++
 tests/test_two_stage.py          | 42 ++++++++++++++++++++++++++++++++
 6 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/diff_diff/imputation.py b/diff_diff/imputation.py
index 3798927e..49a40333 100644
--- a/diff_diff/imputation.py
+++ b/diff_diff/imputation.py
@@ -1515,8 +1515,18 @@ def _build_A_sparse(df_sub, unit_vals, time_vals):
             A0tA0_sparse = A_0.T @ A_0  # stays sparse
         try:
             z = spsolve(A0tA0_sparse.tocsc(), A1_w)
-        except Exception:
-            # Fallback to dense lstsq if sparse solver fails (e.g., singular matrix)
+        except Exception as exc:
+            # Fallback to dense lstsq if sparse solver fails (e.g., singular matrix).
+            # Silent-failure audit axis C: emit a UserWarning on fallback instead
+            # of swallowing the error.
+            warnings.warn(
+                "ImputationDiD variance: sparse solve of (A_0' [W] A_0) z = A_1' w "
+                f"failed ({type(exc).__name__}); falling back to dense lstsq. This "
+                "may indicate a rank-deficient or near-singular normal-equations "
+                "matrix and variance estimates may be less reliable.",
+                UserWarning,
+                stacklevel=2,
+            )
             A0tA0_dense = A0tA0_sparse.toarray()
             z, _, _, _ = np.linalg.lstsq(A0tA0_dense, A1_w, rcond=None)
 
diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py
index 560ded0a..6a385bd9 100644
--- a/diff_diff/two_stage.py
+++ b/diff_diff/two_stage.py
@@ -1652,8 +1652,18 @@ def _compute_gmm_variance(
                 gamma_hat = np.column_stack(
                     [solve_XtX(Xt1_WX2[:, j]) for j in range(Xt1_WX2.shape[1])]
                 )
-        except RuntimeError:
-            # Singular matrix — fall back to dense least-squares
+        except RuntimeError as exc:
+            # Singular matrix — fall back to dense least-squares. Silent-failure
+            # audit axis C: emit a UserWarning on fallback instead of swallowing.
+            warnings.warn(
+                "TwoStageDiD GMM sandwich: sparse factorization of "
+                f"(X'_{{10}} W X_{{10}}) failed ({type(exc).__name__}); falling "
+                "back to dense lstsq. This may indicate a rank-deficient or "
+                "near-singular Stage 1 design matrix and SE estimates may be "
+                "less reliable.",
+                UserWarning,
+                stacklevel=2,
+            )
             gamma_hat = np.linalg.lstsq(XtWX_10.toarray(), Xt1_WX2, rcond=None)[0]
             if gamma_hat.ndim == 1:
                 gamma_hat = gamma_hat.reshape(-1, 1)
diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py
index 3effef4e..ea23ef40 100644
--- a/diff_diff/two_stage_bootstrap.py
+++ b/diff_diff/two_stage_bootstrap.py
@@ -139,7 +139,17 @@ def _compute_cluster_S_scores(
                 gamma_hat = np.column_stack(
                     [solve_XtX(Xt1_X2[:, j]) for j in range(Xt1_X2.shape[1])]
                 )
-        except RuntimeError:
+        except RuntimeError as exc:
+            # Silent-failure audit axis C: emit a UserWarning on fallback instead
+            # of swallowing the error.
+            warnings.warn(
+                "TwoStageDiD bootstrap: sparse factorization of X_10' X_10 "
+                f"failed ({type(exc).__name__}); falling back to dense lstsq. "
+                "This may indicate a rank-deficient or near-singular Stage 1 "
+                "design matrix and bootstrap SE estimates may be less reliable.",
+                UserWarning,
+                stacklevel=2,
+            )
             gamma_hat = np.linalg.lstsq(XtX_10.toarray(), Xt1_X2, rcond=None)[0]
             if gamma_hat.ndim == 1:
                 gamma_hat = gamma_hat.reshape(-1, 1)
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index cc5b9037..64d2e63a 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -1079,7 +1079,7 @@ where `W_it(h) = 1[K_it = h]` are lead indicators, estimated on `Omega_0` only.
 - **Non-constant `first_treat` within a unit:** Emits `UserWarning` identifying the count and example unit. The estimator proceeds using the first observed value per unit (via `.first()` aggregation), but results may be unreliable.
 - **treatment_effects DataFrame weights:** `weight` column uses `1/n_valid` for finite tau_hat and 0 for NaN tau_hat, consistent with the ATT estimand (unweighted), or normalized survey weights `sw_i/sum(sw)` when `survey_design` is active.
 - **Rank-deficient covariates in variance:** Covariates with NaN coefficients (dropped for rank deficiency in Step 1) are excluded from the variance design matrices `A_0`/`A_1`. Only covariates with finite coefficients participate in the `v_it` projection.
-- **Sparse variance solver:** `_compute_v_untreated_with_covariates` uses `scipy.sparse.linalg.spsolve` to solve `(A_0'A_0) z = A_1'w` without densifying the normal equations matrix. Falls back to dense `lstsq` if the sparse solver fails.
+- **Sparse variance solver:** `_compute_v_untreated_with_covariates` uses `scipy.sparse.linalg.spsolve` to solve `(A_0'A_0) z = A_1'w` without densifying the normal equations matrix. Falls back to dense `lstsq` if the sparse solver fails and emits a `UserWarning` on the fallback (silent-failure audit axis C) so callers know variance estimates came from the degraded path.
 - **Note:** Survey weights enter ImputationDiD via weighted iterative FE (Step 1), survey-weighted ATT aggregation (Step 3), and design-based variance via `compute_survey_if_variance()`. PSU clustering, stratification, and FPC are fully supported in the Theorem 3 variance path. When `resolved_survey` is present, the observation-level influence function (`v_it * epsilon_tilde_it`) is passed to `compute_survey_if_variance()` which applies the stratified PSU-level sandwich with FPC correction. Strata also enters survey df (n_PSU - n_strata) for t-distribution inference. Bootstrap + survey supported (Phase 6) via PSU-level multiplier weights.
 - **Bootstrap inference:** Uses multiplier bootstrap on the Theorem 3 influence function: `psi_i = sum_t v_it * epsilon_tilde_it`. Cluster-level psi sums are pre-computed for each aggregation target (overall, per-horizon, per-group), then perturbed with multiplier weights (Rademacher by default; configurable via `bootstrap_weights` parameter to use Mammen or Webb weights, matching CallawaySantAnna). This is a library extension (not in the paper) consistent with CallawaySantAnna/SunAbraham bootstrap patterns.
 - **Auxiliary residuals (Equation 8):** Uses v_it-weighted tau_tilde_g formula: `tau_tilde_g = sum(v_it * tau_hat_it) / sum(v_it)` within each partition group. Zero-weight groups (common in event-study SE computation) fall back to unweighted mean.
@@ -1162,6 +1162,7 @@ Our implementation uses multiplier bootstrap on the GMM influence function: clus
 - **Zero-observation cohorts in group effects:** If all treated observations for a cohort have NaN `y_tilde` (excluded from estimation), that cohort's group effect is NaN with n_obs=0.
 - **Note:** Survey weights in TwoStageDiD GMM sandwich via weighted cross-products: bread uses (X'_2 W X_2)^{-1}, gamma_hat uses (X'_{10} W X_{10})^{-1}(X'_1 W X_2), per-cluster scores multiply by survey weights. PSU clustering, stratification, and FPC are fully supported in the meat matrix via `_compute_stratified_meat_from_psu_scores()`. When strata or FPC are present, the meat computation replaces `S' S` with the stratified formula `sum_h (1 - f_h) * (n_h/(n_h-1)) * centered_h' centered_h`. Strata also enters survey df (n_PSU - n_strata) for t-distribution inference. Bootstrap + survey supported (Phase 6) via PSU-level multiplier weights.
 - **Note:** Both the iterative FE solver (`_iterative_fe`, Stage 1) and the iterative alternating-projection demeaning helper (`_iterative_demean`, used in covariate residualization) emit `UserWarning` when `max_iter` exhausts without reaching `tol`, via `diff_diff.utils.warn_if_not_converged`. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the logistic/Poisson IRLS pattern in `linalg.py`.
+- **Note:** The GMM sandwich and bootstrap paths both use `scipy.sparse.linalg.factorized` for the Stage 1 normal-equations solve `(X'_{10} W X_{10}) gamma = X'_1 W X_2` and fall back to dense `lstsq` when the sparse factorization raises `RuntimeError` on a near-singular matrix. Both fallback sites emit a `UserWarning` (silent-failure audit axis C) so callers know SE estimates came from the degraded path rather than the fast sparse path.
 
 **Reference implementation(s):**
 - R: `did2s::did2s()` (Kyle Butts & John Gardner)
diff --git a/tests/test_imputation.py b/tests/test_imputation.py
index 53093d8d..4e8cf9ce 100644
--- a/tests/test_imputation.py
+++ b/tests/test_imputation.py
@@ -885,6 +885,31 @@ def test_sparse_solver_dense_fallback(self):
         assert np.isfinite(results.overall_se)
         assert results.overall_se > 0
 
+    def test_sparse_solver_dense_fallback_emits_warning(self):
+        """Silent-failure audit axis C: the sparse -> dense lstsq fallback must
+        emit a UserWarning so callers are informed that variance estimates come
+        from the degraded path."""
+        import unittest.mock
+
+        data = generate_test_data(n_units=80, n_periods=8, seed=42)
+        rng = np.random.default_rng(42)
+        data["x1"] = rng.standard_normal(len(data))
+
+        est = ImputationDiD()
+
+        with unittest.mock.patch(
+            "diff_diff.imputation.spsolve", side_effect=RuntimeError("test failure")
+        ):
+            with pytest.warns(UserWarning, match="sparse solve.*falling back to dense lstsq"):
+                est.fit(
+                    data,
+                    outcome="outcome",
+                    unit="unit",
+                    time="time",
+                    first_treat="first_treat",
+                    covariates=["x1"],
+                )
+
 
 # =============================================================================
 # TestImputationBootstrap
diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py
index ebf093be..335ff942 100644
--- a/tests/test_two_stage.py
+++ b/tests/test_two_stage.py
@@ -490,6 +490,48 @@ def test_event_study_se_positive(self):
                 assert eff["se"] > 0, f"SE at h={h} should be positive"
                 assert np.isfinite(eff["se"])
 
+    def test_sparse_factorized_dense_fallback_emits_warning(self):
+        """Silent-failure audit axis C: when sparse factorization of Stage 1's
+        normal-equations matrix fails and the GMM sandwich falls back to dense
+        lstsq, a UserWarning must surface so callers know SE came from the
+        degraded path rather than the fast sparse path."""
+        import unittest.mock
+
+        data = generate_test_data()
+
+        with unittest.mock.patch(
+            "diff_diff.two_stage.sparse_factorized",
+            side_effect=RuntimeError("test failure"),
+        ):
+            with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"):
+                TwoStageDiD().fit(
+                    data,
+                    outcome="outcome",
+                    unit="unit",
+                    time="time",
+                    first_treat="first_treat",
+                )
+
+    def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self):
+        """Silent-failure audit axis C: the TwoStage bootstrap path has the
+        same sparse->dense fallback and must also emit a UserWarning."""
+        import unittest.mock
+
+        data = generate_test_data()
+
+        with unittest.mock.patch(
+            "diff_diff.two_stage_bootstrap.sparse_factorized",
+            side_effect=RuntimeError("test failure"),
+        ):
+            with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"):
+                TwoStageDiD(n_bootstrap=4, seed=42).fit(
+                    data,
+                    outcome="outcome",
+                    unit="unit",
+                    time="time",
+                    first_treat="first_treat",
+                )
+
 
 # =============================================================================
 # TestTwoStageDiDEdgeCases

From 28507ec7fd32a44cfecc87fdc628d0eca3e1158e Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 18 Apr 2026 16:24:32 -0400
Subject: [PATCH 2/2] Assert dense fallback SEs remain usable in TwoStage
 sparse->dense tests

AI review on PR #319 flagged that my new TwoStage warning tests only
verify that the UserWarning fires but not that the dense fallback still
produces finite, usable SEs. A future control-flow regression could
keep the warning while breaking the degraded path.

Mirror the assertion shape used in the pre-existing ImputationDiD
test_sparse_solver_dense_fallback: after the warned .fit(), assert
overall_se is finite and > 0 for both the GMM sandwich and bootstrap
paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_two_stage.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py
index 335ff942..cf5a47b1 100644
--- a/tests/test_two_stage.py
+++ b/tests/test_two_stage.py
@@ -494,7 +494,11 @@ def test_sparse_factorized_dense_fallback_emits_warning(self):
         """Silent-failure audit axis C: when sparse factorization of Stage 1's
         normal-equations matrix fails and the GMM sandwich falls back to dense
         lstsq, a UserWarning must surface so callers know SE came from the
-        degraded path rather than the fast sparse path."""
+        degraded path rather than the fast sparse path.
+
+        Also verifies the dense fallback still yields finite, usable SEs so
+        that a future regression in the fallback control flow cannot keep the
+        warning while breaking the degraded path."""
         import unittest.mock
 
         data = generate_test_data()
@@ -504,7 +508,7 @@ def test_sparse_factorized_dense_fallback_emits_warning(self):
             side_effect=RuntimeError("test failure"),
         ):
             with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"):
-                TwoStageDiD().fit(
+                results = TwoStageDiD().fit(
                     data,
                     outcome="outcome",
                     unit="unit",
@@ -512,9 +516,16 @@ def test_sparse_factorized_dense_fallback_emits_warning(self):
                     first_treat="first_treat",
                 )
 
+        # Dense fallback must still produce a usable SE.
+        assert np.isfinite(results.overall_se)
+        assert results.overall_se > 0
+
     def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self):
         """Silent-failure audit axis C: the TwoStage bootstrap path has the
-        same sparse->dense fallback and must also emit a UserWarning."""
+        same sparse->dense fallback and must also emit a UserWarning.
+
+        Also verifies the bootstrap dense fallback still yields finite,
+        usable SEs."""
         import unittest.mock
 
         data = generate_test_data()
@@ -524,7 +535,7 @@ def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self):
             side_effect=RuntimeError("test failure"),
         ):
             with pytest.warns(UserWarning, match="sparse factorization.*falling back to dense lstsq"):
-                TwoStageDiD(n_bootstrap=4, seed=42).fit(
+                results = TwoStageDiD(n_bootstrap=4, seed=42).fit(
                     data,
                     outcome="outcome",
                     unit="unit",
@@ -532,6 +543,10 @@ def test_sparse_factorized_bootstrap_dense_fallback_emits_warning(self):
                     first_treat="first_treat",
                 )
 
+        # Bootstrap dense fallback must still produce a usable SE.
+        assert np.isfinite(results.overall_se)
+        assert results.overall_se > 0
+
 
 # =============================================================================
 # TestTwoStageDiDEdgeCases