From 8b1914c492af698dd99b4ae1a0eb351825f5edfc Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 19:35:16 -0400
Subject: [PATCH 1/3] Surface simulate_power simulation-failure count and
 narrow except clause
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two silent-failure patterns at `power.py:2241`, addressed together:

1. A bare `except Exception` absorbed every exception a user-supplied
estimator or result extractor could raise, including programming errors
(TypeError, AttributeError, IndexError, ...). Those masked bugs counted
as "simulation failures" and were aggregated as a reliability signal
instead of propagating. Narrowed the catch to `(ValueError,
np.linalg.LinAlgError, KeyError, RuntimeError, ZeroDivisionError)` —
the set that reasonable DGPs and fit paths can raise on adversarial
samples.

2. The internal `n_failures` counter was per-effect-size and thrown
away after the outer loop, leaving users with no programmatic way to
check whether a run was clean. Surface the primary-effect failure count
on the results object as `SimulationPowerResults.n_simulation_failures`
and include it in `summary()` output.

The proportional >10% `UserWarning` and the raise-if-all-fail escape
are preserved.

Covered by audit axis C (silent fallback). Finding #11 from
`docs/audits/silent-failures-findings.md`.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py           |  21 ++++++-
 docs/methodology/REGISTRY.md |   1 +
 tests/test_power.py          | 104 +++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index 87959b0a..a464b90b 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -995,7 +995,13 @@ class SimulationPowerResults:
     coverage : float
         Proportion of CIs containing true effect.
     n_simulations : int
-        Number of simulations performed.
+        Number of simulations performed (successful count; see
+        ``n_simulation_failures`` for the failed-replicate count).
+    n_simulation_failures : int
+        Number of simulations at the primary effect size skipped because
+        ``estimator.fit`` (or result extraction) raised an exception.
+        Lets callers programmatically detect fragile DGP/estimator
+        pairings; a proportional warning also fires above a 10% rate.
     effect_sizes : List[float]
         Effect sizes tested (if multiple).
     powers : List[float]
@@ -1020,6 +1026,7 @@ class SimulationPowerResults:
     mean_se: float
     coverage: float
     n_simulations: int
+    n_simulation_failures: int
     effect_sizes: List[float]
     powers: List[float]
     true_effect: float
@@ -1062,6 +1069,7 @@ def summary(self) -> str:
             "",
             f"{'Estimator:':<35} {self.estimator_name}",
             f"{'Number of simulations:':<35} {self.n_simulations}",
+            f"{'Simulation failures:':<35} {self.n_simulation_failures}",
             f"{'True treatment effect:':<35} {self.true_effect:.4f}",
             f"{'Significance level (alpha):':<35} {self.alpha:.3f}",
             "",
@@ -2097,6 +2105,7 @@ def simulate_power(
     primary_p_values: List[float] = []
     primary_rejections: List[bool] = []
    primary_ci_contains: List[bool] = []
+    primary_n_failures = 0

     # Survey DGP truth accumulation (DEFF/ICC are DGP properties,
     # independent of effect size, so averaging across all sims is correct)
@@ -2238,7 +2247,13 @@
                 rejections.append(rejected)
                 ci_contains_true.append(ci[0] <= effect <= ci[1])

-            except Exception as e:
+            except (
+                ValueError,
+                np.linalg.LinAlgError,
+                KeyError,
+                RuntimeError,
+                ZeroDivisionError,
+            ) as e:
                 n_failures += 1
                 if progress:
                     print(f"  Warning: Simulation {sim} failed: {e}")
@@ -2266,6 +2281,7 @@
             primary_p_values = p_values
             primary_rejections = rejections
             primary_ci_contains = ci_contains_true
+            primary_n_failures = n_failures

     # Compute confidence interval for power (primary effect)
     power_val = all_powers[primary_idx]
@@ -2292,6 +2308,7 @@
         mean_se=mean_se,
         coverage=coverage,
         n_simulations=n_valid,
+        n_simulation_failures=primary_n_failures,
         effect_sizes=effect_sizes,
         powers=all_powers,
         true_effect=primary_effect,
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index a7f11455..27f2a419 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -2353,6 +2353,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
 - **Note:** The simulation-based power registry (`simulate_power`, `simulate_mde`, `simulate_sample_size`) uses a single-cohort staggered DGP by default. Estimators configured with `control_group="not_yet_treated"`, `clean_control="strict"`, or `anticipation>0` will receive a `UserWarning` because the default DGP does not match their identification strategy. Users must supply `data_generator_kwargs` (e.g., `cohort_periods=[2, 4]`, `never_treated_frac=0.0`) or a custom `data_generator` to match the estimator design.
 - **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
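A minimal sketch of the rounding rule in the `TripleDifference` note above (the helper name `effective_ddd_n` is illustrative, not a library function; only the documented mapping is reproduced):

```python
# Sketch only: the documented mapping n_per_cell = max(2, n_units // 8),
# effective total N = n_per_cell * 8.
def effective_ddd_n(n_units: int) -> int:
    n_per_cell = max(2, n_units // 8)  # 8 cells: group x partition x time
    return n_per_cell * 8

assert effective_ddd_n(100) == 96  # non-multiple of 8 rounds down
assert effective_ddd_n(10) == 16   # values below 16 clamp to 16
```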
 - **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
+- **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
 - **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations.
 `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units`, which varies during bisection).
 **Reference implementation(s):**
diff --git a/tests/test_power.py b/tests/test_power.py
index 793f56c9..e3ab3622 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -556,6 +556,110 @@ class Result:
         # Should have completed successfully without warning
         assert len([x for x in w if "simulations" in str(x.message)]) == 0

+    def test_simulation_failure_counter_on_result(self):
+        """`n_simulation_failures` on the result object surfaces the internal counter."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            """Raises ValueError on every other call — ~50% failure rate."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        estimator = AlternatingFailingEstimator()
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=estimator,
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 10
+        assert results.n_simulations == 10
+        assert results.n_simulation_failures + results.n_simulations == 20
+
+    def test_simulation_failure_counter_zero_on_clean_run(self):
+        """Clean run: counter is exactly 0, not omitted or None."""
+        did = DifferenceInDifferences()
+        results = simulate_power(
+            estimator=did,
+            n_units=50,
+            n_periods=4,
+            treatment_effect=5.0,
+            sigma=2.0,
+            n_simulations=15,
+            seed=42,
+            progress=False,
+        )
+        assert results.n_simulation_failures == 0
+
+    def test_simulation_does_not_swallow_programming_errors(self):
+        """`TypeError` (programming error) must propagate, not be absorbed as a failure."""
+        from diff_diff.prep import generate_did_data
+
+        class TypeErrorEstimator:
+            """Raises TypeError — a programming bug signal, not a DGP failure."""
+
+            def fit(self, data, **kwargs):
+                raise TypeError("programming bug — must propagate")
+
+        with pytest.raises(TypeError, match="programming bug"):
+            simulate_power(
+                estimator=TypeErrorEstimator(),
+                n_simulations=5,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+    def test_simulation_failure_rate_warning_above_threshold(self):
+        """10% threshold: >10% failure still warns with the per-effect-size message."""
+        from diff_diff.prep import generate_did_data
+
+        class MostlyFailingEstimator:
+            """Fails 16/20 calls (80% failure rate) — triggers warning."""
+
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 5 != 0:
+                    raise ValueError("forced failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with pytest.warns(UserWarning, match=r"simulations .* failed for effect_size="):
+            results = simulate_power(
+                estimator=MostlyFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        assert results.n_simulation_failures == 16
+        assert results.n_simulations == 4
+

 class TestVisualization:
     """Tests for power curve visualization."""

From af56950caafd496a5675963505c5c47cb5eb7f28 Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 19:36:29 -0400
Subject: [PATCH 2/3] Address local AI review: include n_simulation_failures
 in to_dict()

The two P2 findings from the local AI review reduced to one issue: the
new `n_simulation_failures` field was added to the dataclass and
`summary()` but not to `to_dict()`. That left a gap between the
in-memory result and the JSON/DataFrame serialization path used by
notebooks and pipelines.

- Add `n_simulation_failures` to `SimulationPowerResults.to_dict()`.
- Add a regression test asserting the field round-trips through
  `to_dict()` after a partially-failing run.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py  |  1 +
 tests/test_power.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index a464b90b..750bec3f 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -1138,6 +1138,7 @@ def to_dict(self) -> Dict[str, Any]:
             "mean_se": self.mean_se,
             "coverage": self.coverage,
             "n_simulations": self.n_simulations,
+            "n_simulation_failures": self.n_simulation_failures,
             "true_effect": self.true_effect,
             "alpha": self.alpha,
             "estimator_name": self.estimator_name,
diff --git a/tests/test_power.py b/tests/test_power.py
index e3ab3622..b7107a6f 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -626,6 +626,40 @@ def fit(self, data, **kwargs):
                 data_generator=generate_did_data,
             )

+    def test_simulation_failure_counter_survives_serialization(self):
+        """`n_simulation_failures` round-trips through `to_dict()`."""
+        from diff_diff.prep import generate_did_data
+
+        class AlternatingFailingEstimator:
+            def __init__(self):
+                self.call_count = 0
+
+            def fit(self, data, **kwargs):
+                self.call_count += 1
+                if self.call_count % 2 == 0:
+                    raise ValueError("forced simulated failure")
+
+                class Result:
+                    att = 5.0
+                    se = 1.0
+                    p_value = 0.01
+                    conf_int = (3.0, 7.0)
+
+                return Result()
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            results = simulate_power(
+                estimator=AlternatingFailingEstimator(),
+                n_simulations=20,
+                progress=False,
+                data_generator=generate_did_data,
+            )
+
+        serialized = results.to_dict()
+        assert "n_simulation_failures" in serialized
+        assert serialized["n_simulation_failures"] == results.n_simulation_failures == 10
+
     def test_simulation_failure_rate_warning_above_threshold(self):
         """10% threshold: >10% failure still warns with the per-effect-size message."""
         from diff_diff.prep import generate_did_data

From a1a522d16723db4a4b3d36fc45cab3bb6f3c8a6a Mon Sep 17 00:00:00 2001
From: igerber
Date: Sat, 18 Apr 2026 20:36:47 -0400
Subject: [PATCH 3/3] Address CI review: default n_simulation_failures=0 +
 all-failed test

Two items from CI AI review on PR #326:

1. P2 backward-compat: move `n_simulation_failures` to the end of
`SimulationPowerResults` with a default of `0`. Users who manually
instantiate the dataclass with the pre-PR field order continue to work;
`simulate_power()` still fills the field in via keyword. The field
remains part of `to_dict()` output (the PR-level contract is unchanged).

2. P3 coverage: add a regression test for the all-failed escape path.
A test with an estimator that raises `ValueError` on every replicate
asserts both the `RuntimeError("All simulations failed. ...")` message
and that the narrow-except filter doesn't swallow it.
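Illustrative caller-facing check (mirrors the clean-run test from
PATCH 1/3; not part of this diff):

    results = simulate_power(
        estimator=DifferenceInDifferences(),
        n_units=50,
        n_periods=4,
        treatment_effect=5.0,
        sigma=2.0,
        n_simulations=15,
        seed=42,
        progress=False,
    )
    assert results.n_simulation_failures == 0   # clean run, default intact
    assert "n_simulation_failures" in results.to_dict()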
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diff_diff/power.py  |  2 +-
 tests/test_power.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/diff_diff/power.py b/diff_diff/power.py
index 750bec3f..6fad48a5 100644
--- a/diff_diff/power.py
+++ b/diff_diff/power.py
@@ -1026,7 +1026,6 @@ class SimulationPowerResults:
     mean_se: float
     coverage: float
     n_simulations: int
-    n_simulation_failures: int
     effect_sizes: List[float]
     powers: List[float]
     true_effect: float
@@ -1039,6 +1038,7 @@ class SimulationPowerResults:
     survey_config: Optional[Any] = field(default=None, repr=False)
     mean_deff: Optional[float] = None
     mean_icc_realized: Optional[float] = None
+    n_simulation_failures: int = 0

     def __post_init__(self):
         """Compute derived statistics."""
diff --git a/tests/test_power.py b/tests/test_power.py
index b7107a6f..d4a2c287 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -626,6 +626,24 @@ def fit(self, data, **kwargs):
                 data_generator=generate_did_data,
             )

+    def test_simulation_all_failed_raises_runtime_error(self):
+        """All simulations failing: narrow-except path still raises RuntimeError."""
+        from diff_diff.prep import generate_did_data
+
+        class AlwaysFailingEstimator:
+            def fit(self, data, **kwargs):
+                raise ValueError("every replicate fails")
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            with pytest.raises(RuntimeError, match="All simulations failed"):
+                simulate_power(
+                    estimator=AlwaysFailingEstimator(),
+                    n_simulations=5,
+                    progress=False,
+                    data_generator=generate_did_data,
+                )
+
     def test_simulation_failure_counter_survives_serialization(self):
         """`n_simulation_failures` round-trips through `to_dict()`."""
         from diff_diff.prep import generate_did_data
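A condensed, self-contained sketch of the per-replicate handling the
series converges on (`run_replicate` is an illustrative helper, not
library code; the exception tuple is taken verbatim from the PATCH 1/3
hunk):

    import numpy as np

    def run_replicate(estimator, data):
        # Condensed from the PATCH 1/3 hunk: absorb plausible fit-path
        # failures, let programming errors propagate.
        try:
            estimator.fit(data)
            return True
        except (
            ValueError,
            np.linalg.LinAlgError,
            KeyError,
            RuntimeError,
            ZeroDivisionError,
        ):
            return False  # counted toward n_simulation_failures

    class Flaky:
        def fit(self, data):
            raise ValueError("adversarial sample")

    class Buggy:
        def fit(self, data):
            raise TypeError("programming bug")

    assert run_replicate(Flaky(), data=None) is False  # absorbed, counted
    try:
        run_replicate(Buggy(), data=None)  # propagates: not in the tuple
    except TypeError:
        pass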