From 8b1914c492af698dd99b4ae1a0eb351825f5edfc Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 19:35:16 -0400
Subject: [PATCH 1/3] Surface simulate_power simulation-failure count and
 narrow except clause
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two silent-failure patterns at `power.py:2241`, addressed together:

1. A bare `except Exception` absorbed every exception a user-supplied
estimator or result extractor could raise, including programming errors
(TypeError, AttributeError, IndexError, ...). Those masked bugs counted
as "simulation failures" and were aggregated as a reliability signal
instead of propagating. Narrowed the catch to `(ValueError,
np.linalg.LinAlgError, KeyError, RuntimeError, ZeroDivisionError)` —
the set that reasonable DGPs and fit paths can raise on adversarial
samples.

2. The internal `n_failures` counter was per-effect-size and thrown
away after the outer loop, leaving users with no programmatic way to
check whether a run was clean. Surface the primary-effect failure count
on the results object as `SimulationPowerResults.n_simulation_failures`
and include it in `summary()` output.

The proportional >10% `UserWarning` and the raise-if-all-fail escape
are preserved.

Covered by audit axis C (silent fallback). Finding #11 from
`docs/audits/silent-failures-findings.md`.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py           |  21 ++++++-
 docs/methodology/REGISTRY.md |   1 +
 tests/test_power.py          | 104 +++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index 87959b0a..a464b90b 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -995,7 +995,13 @@ class SimulationPowerResults:
     coverage : float
         Proportion of CIs containing true effect.
     n_simulations : int
-        Number of simulations performed.
+        Number of simulations performed (successful count; see
+        ``n_simulation_failures`` for the failed-replicate count).
+    n_simulation_failures : int
+        Number of simulations at the primary effect size skipped because
+        ``estimator.fit`` (or result extraction) raised an exception.
+        Lets callers programmatically detect fragile DGP/estimator
+        pairings; a proportional warning also fires above a 10% rate.
     effect_sizes : List[float]
         Effect sizes tested (if multiple).
     powers : List[float]
@@ -1020,6 +1026,7 @@ class SimulationPowerResults:
     mean_se: float
     coverage: float
     n_simulations: int
+    n_simulation_failures: int
     effect_sizes: List[float]
     powers: List[float]
     true_effect: float
@@ -1062,6 +1069,7 @@ def summary(self) -> str:
             "",
             f"{'Estimator:':<35} {self.estimator_name}",
             f"{'Number of simulations:':<35} {self.n_simulations}",
+            f"{'Simulation failures:':<35} {self.n_simulation_failures}",
             f"{'True treatment effect:':<35} {self.true_effect:.4f}",
             f"{'Significance level (alpha):':<35} {self.alpha:.3f}",
             "",
@@ -2097,6 +2105,7 @@ def simulate_power(
     primary_p_values: List[float] = []
     primary_rejections: List[bool] = []
    primary_ci_contains: List[bool] = []
+    primary_n_failures = 0

     # Survey DGP truth accumulation (DEFF/ICC are DGP properties,
     # independent of effect size, so averaging across all sims is correct)
@@ -2238,7 +2247,13 @@
                 rejections.append(rejected)
                 ci_contains_true.append(ci[0] <= effect <= ci[1])

-            except Exception as e:
+            except (
+                ValueError,
+                np.linalg.LinAlgError,
+                KeyError,
+                RuntimeError,
+                ZeroDivisionError,
+            ) as e:
                 n_failures += 1
                 if progress:
                     print(f"  Warning: Simulation {sim} failed: {e}")
@@ -2266,6 +2281,7 @@
             primary_p_values = p_values
             primary_rejections = rejections
             primary_ci_contains = ci_contains_true
+            primary_n_failures = n_failures

     # Compute confidence interval for power (primary effect)
     power_val = all_powers[primary_idx]
@@ -2292,6 +2308,7 @@
         mean_se=mean_se,
         coverage=coverage,
         n_simulations=n_valid,
+        n_simulation_failures=primary_n_failures,
         effect_sizes=effect_sizes,
         powers=all_powers,
         true_effect=primary_effect,
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index a7f11455..27f2a419 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -2353,6 +2353,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 - **Note:** The simulation-based power registry (`simulate_power`, `simulate_mde`, `simulate_sample_size`) uses a single-cohort staggered DGP by default. Estimators configured with `control_group="not_yet_treated"`, `clean_control="strict"`, or `anticipation>0` will receive a `UserWarning` because the default DGP does not match their identification strategy. Users must supply `data_generator_kwargs` (e.g., `cohort_periods=[2, 4]`, `never_treated_frac=0.0`) or a custom `data_generator` to match the estimator design.
 - **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
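A minimal sketch of the rounding rule in the `TripleDifference` note above (the helper name `effective_ddd_n` is illustrative, not a library function; only the documented mapping is reproduced):

```python
# Sketch only: the documented mapping n_per_cell = max(2, n_units // 8),
# effective total N = n_per_cell * 8.
def effective_ddd_n(n_units: int) -> int:
    n_per_cell = max(2, n_units // 8)  # 8 cells: group x partition x time
    return n_per_cell * 8

assert effective_ddd_n(100) == 96  # non-multiple of 8 rounds down
assert effective_ddd_n(10) == 16   # values below 16 clamp to 16
```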
 - **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
+- **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
 - **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations.
 `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units`, which varies during bisection).
 **Reference implementation(s):**
diff --git a/tests/test_power.py b/tests/test_power.py
index 793f56c9..e3ab3622 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -556,6 +556,110 @@ class Result:
         # Should have completed successfully without warning
         assert len([x for x in w if "simulations" in str(x.message)]) == 0

+    def test_simulation_failure_counter_on_result(self):
+        """`n_simulation_failures` on the result object surfaces the internal counter."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            """Raises ValueError on every other call — ~50% failure rate."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        estimator = AlternatingFailingEstimator()
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=estimator,
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 10
+        assert results.n_simulations == 10
+        assert results.n_simulation_failures + results.n_simulations == 20
+
+    def test_simulation_failure_counter_zero_on_clean_run(self):
+        """Clean run: counter is exactly 0, not omitted or None."""
+        did = DifferenceInDifferences()
+        results = simulate_power(
+            estimator=did,
+            n_units=50,
+            n_periods=4,
+            treatment_effect=5.0,
+            sigma=2.0,
+            n_simulations=15,
+            seed=42,
+            progress=False,
+        )
+        assert results.n_simulation_failures == 0
+
+    def test_simulation_does_not_swallow_programming_errors(self):
+        """`TypeError` (programming error) must propagate, not be absorbed as a failure."""
+        from diff_diff.prep import generate_did_data
+
+        class TypeErrorEstimator:
+            """Raises TypeError — a programming bug signal, not a DGP failure."""
+
+            def fit(self, data, **kwargs):
+                raise TypeError("programming bug — must propagate")
+
+        with pytest.raises(TypeError, match="programming bug"):
+            simulate_power(
+                estimator=TypeErrorEstimator(),
+                n_simulations=5,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+    def test_simulation_failure_rate_warning_above_threshold(self):
+        """10% threshold: >10% failure still warns with the per-effect-size message."""
+        from diff_diff.prep import generate_did_data
+
+        class MostlyFailingEstimator:
+            """Fails 16/20 calls (80% failure rate) — triggers warning."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 5 != 0:
+                    raise ValueError("forced failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with pytest.warns(UserWarning, match=r"simulations .* failed for effect_size="):
+            results = simulate_power(
+                estimator=MostlyFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 16
+        assert results.n_simulations == 4
+

 class TestVisualization:
     """Tests for power curve visualization."""

From af56950caafd496a5675963505c5c47cb5eb7f28 Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 19:36:29 -0400
Subject: [PATCH 2/3] Address local AI review: include n_simulation_failures
 in to_dict()

The two P2 findings from the local AI review reduced to one issue: the
new `n_simulation_failures` field was added to the dataclass and
`summary()` but not to `to_dict()`. That left a gap between the
in-memory result and the JSON/DataFrame serialization path used by
notebooks and pipelines.

- Add `n_simulation_failures` to `SimulationPowerResults.to_dict()`.
- Add a regression test asserting the field round-trips through
  `to_dict()` after a partially-failing run.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py  |  1 +
 tests/test_power.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index a464b90b..750bec3f 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -1138,6 +1138,7 @@ def to_dict(self) -> Dict[str, Any]:
             "mean_se": self.mean_se,
             "coverage": self.coverage,
             "n_simulations": self.n_simulations,
+            "n_simulation_failures": self.n_simulation_failures,
             "true_effect": self.true_effect,
             "alpha": self.alpha,
             "estimator_name": self.estimator_name,
diff --git a/tests/test_power.py b/tests/test_power.py
index e3ab3622..b7107a6f 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -626,6 +626,40 @@ def fit(self, data, **kwargs):
                 data_generator=generate_did_data,
             )

+    def test_simulation_failure_counter_survives_serialization(self):
+        """`n_simulation_failures` round-trips through `to_dict()`."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=AlternatingFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        serialized = results.to_dict()
+        assert "n_simulation_failures" in serialized
+        assert serialized["n_simulation_failures"] == results.n_simulation_failures == 10
+
     def test_simulation_failure_rate_warning_above_threshold(self):
         """10% threshold: >10% failure still warns with the per-effect-size message."""
         from diff_diff.prep import generate_did_data

From a1a522d16723db4a4b3d36fc45cab3bb6f3c8a6a Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 20:36:47 -0400
Subject: [PATCH 3/3] Address CI review: default n_simulation_failures=0 +
 all-failed test

Two items from CI AI review on PR #326:

1. P2 backward-compat: move `n_simulation_failures` to the end of
`SimulationPowerResults` with a default of `0`. Users who manually
instantiate the dataclass with the pre-PR field order continue to work;
`simulate_power()` still fills the field in via keyword. The field
remains part of `to_dict()` output (the PR-level contract is unchanged).

2. P3 coverage: add a regression test for the all-failed escape path.
A test with an estimator that raises `ValueError` on every replicate
asserts both the `RuntimeError("All simulations failed. ...")` message
and that the narrow-except filter doesn't swallow it.
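Illustrative caller-facing check (mirrors the clean-run test from
PATCH 1/3; not part of this diff):

    results = simulate_power(
        estimator=DifferenceInDifferences(),
        n_units=50,
        n_periods=4,
        treatment_effect=5.0,
        sigma=2.0,
        n_simulations=15,
        seed=42,
        progress=False,
    )
    assert results.n_simulation_failures == 0   # clean run, default intact
    assert "n_simulation_failures" in results.to_dict()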
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py  |  2 +-
 tests/test_power.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index 750bec3f..6fad48a5 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -1026,7 +1026,6 @@ class SimulationPowerResults:
     mean_se: float
     coverage: float
     n_simulations: int
-    n_simulation_failures: int
     effect_sizes: List[float]
     powers: List[float]
     true_effect: float
@@ -1039,6 +1038,7 @@ class SimulationPowerResults:
     survey_config: Optional[Any] = field(default=None, repr=False)
     mean_deff: Optional[float] = None
     mean_icc_realized: Optional[float] = None
+    n_simulation_failures: int = 0

     def __post_init__(self):
         """Compute derived statistics."""
diff --git a/tests/test_power.py b/tests/test_power.py
index b7107a6f..d4a2c287 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -626,6 +626,24 @@ def fit(self, data, **kwargs):
                 data_generator=generate_did_data,
             )

+    def test_simulation_all_failed_raises_runtime_error(self):
+        """All simulations failing: narrow-except path still raises RuntimeError."""
+        from diff_diff.prep import generate_did_data
+
+        class AlwaysFailingEstimator:
+            def fit(self, data, **kwargs):
+                raise ValueError("every replicate fails")
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            with pytest.raises(RuntimeError, match="All simulations failed"):
+                simulate_power(
+                    estimator=AlwaysFailingEstimator(),
+                    n_simulations=5,
+                    progress=False,
+                    data_generator=generate_did_data,
+                )
+
     def test_simulation_failure_counter_survives_serialization(self):
         """`n_simulation_failures` round-trips through `to_dict()`."""
         from diff_diff.prep import generate_did_data
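A condensed, self-contained sketch of the per-replicate handling the
series converges on (`run_replicate` is an illustrative helper, not
library code; the exception tuple is taken verbatim from the PATCH 1/3
hunk):

    import numpy as np

    def run_replicate(estimator, data):
        # Condensed from the PATCH 1/3 hunk: absorb plausible fit-path
        # failures, let programming errors propagate.
        try:
            estimator.fit(data)
            return True
        except (
            ValueError,
            np.linalg.LinAlgError,
            KeyError,
            RuntimeError,
            ZeroDivisionError,
        ):
            return False  # counted toward n_simulation_failures

    class Flaky:
        def fit(self, data):
            raise ValueError("adversarial sample")

    class Buggy:
        def fit(self, data):
            raise TypeError("programming bug")

    assert run_replicate(Flaky(), data=None) is False  # absorbed, counted
    try:
        run_replicate(Buggy(), data=None)  # propagates: not in the tuple
    except TypeError:
        pass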