Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions diff_diff/power.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,13 @@ class SimulationPowerResults:
coverage : float
Proportion of CIs containing true effect.
n_simulations : int
Number of simulations performed.
Number of simulations performed (successful count; see
``n_simulation_failures`` for failed-replicate count).
n_simulation_failures : int
Number of simulations at the primary effect size whose `estimator.fit`
(or result extraction) raised an exception and was skipped. Lets
callers programmatically detect fragile DGP/estimator pairings; a
proportional warning is also emitted above a 10% failure rate.
effect_sizes : List[float]
Effect sizes tested (if multiple).
powers : List[float]
Expand Down Expand Up @@ -1032,6 +1038,7 @@ class SimulationPowerResults:
survey_config: Optional[Any] = field(default=None, repr=False)
mean_deff: Optional[float] = None
mean_icc_realized: Optional[float] = None
n_simulation_failures: int = 0

def __post_init__(self):
"""Compute derived statistics."""
Expand Down Expand Up @@ -1062,6 +1069,7 @@ def summary(self) -> str:
"",
f"{'Estimator:':<35} {self.estimator_name}",
f"{'Number of simulations:':<35} {self.n_simulations}",
f"{'Simulation failures:':<35} {self.n_simulation_failures}",
f"{'True treatment effect:':<35} {self.true_effect:.4f}",
f"{'Significance level (alpha):':<35} {self.alpha:.3f}",
"",
Expand Down Expand Up @@ -1130,6 +1138,7 @@ def to_dict(self) -> Dict[str, Any]:
"mean_se": self.mean_se,
"coverage": self.coverage,
"n_simulations": self.n_simulations,
"n_simulation_failures": self.n_simulation_failures,
"true_effect": self.true_effect,
"alpha": self.alpha,
"estimator_name": self.estimator_name,
Expand Down Expand Up @@ -2097,6 +2106,7 @@ def simulate_power(
primary_p_values: List[float] = []
primary_rejections: List[bool] = []
primary_ci_contains: List[bool] = []
primary_n_failures = 0

# Survey DGP truth accumulation (DEFF/ICC are DGP properties,
# independent of effect size, so averaging across all sims is correct)
Expand Down Expand Up @@ -2238,7 +2248,13 @@ def simulate_power(
rejections.append(rejected)
ci_contains_true.append(ci[0] <= effect <= ci[1])

except Exception as e:
except (
ValueError,
np.linalg.LinAlgError,
KeyError,
RuntimeError,
ZeroDivisionError,
) as e:
n_failures += 1
if progress:
print(f" Warning: Simulation {sim} failed: {e}")
Expand Down Expand Up @@ -2266,6 +2282,7 @@ def simulate_power(
primary_p_values = p_values
primary_rejections = rejections
primary_ci_contains = ci_contains_true
primary_n_failures = n_failures

# Compute confidence interval for power (primary effect)
power_val = all_powers[primary_idx]
Expand All @@ -2292,6 +2309,7 @@ def simulate_power(
mean_se=mean_se,
coverage=coverage,
n_simulations=n_valid,
n_simulation_failures=primary_n_failures,
effect_sizes=effect_sizes,
powers=all_powers,
true_effect=primary_effect,
Expand Down
1 change: 1 addition & 0 deletions docs/methodology/REGISTRY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2353,6 +2353,7 @@ n = 2(t_{α/2} + t_{1-κ})² σ² / MDE²
- **Note:** The simulation-based power registry (`simulate_power`, `simulate_mde`, `simulate_sample_size`) uses a single-cohort staggered DGP by default. Estimators configured with `control_group="not_yet_treated"`, `clean_control="strict"`, or `anticipation>0` will receive a `UserWarning` because the default DGP does not match their identification strategy. Users must supply `data_generator_kwargs` (e.g., `cohort_periods=[2, 4]`, `never_treated_frac=0.0`) or a custom `data_generator` to match the estimator design.
- **Note:** The `TripleDifference` registry adapter uses `generate_ddd_data`, a fixed 2×2×2 factorial DGP (group × partition × time). The `n_periods`, `treatment_period`, and `treatment_fraction` parameters are ignored — DDD always simulates 2 periods with balanced groups. `n_units` is mapped to `n_per_cell = max(2, n_units // 8)` (effective total N = `n_per_cell × 8`), so non-multiples of 8 are rounded down and values below 16 are clamped to 16. A `UserWarning` is emitted when simulation inputs differ from the effective DDD design. When rounding occurs, all result objects (`SimulationPowerResults`, `SimulationMDEResults`, `SimulationSampleSizeResults`) set `effective_n_units` to the actual sample size used; it is `None` when no rounding occurred. `simulate_sample_size()` snaps bisection candidates to multiples of 8 so that `required_n` is always a realizable DDD sample size. Passing `n_per_cell` in `data_generator_kwargs` suppresses the effective-N rounding warning but not warnings for ignored parameters (`n_periods`, `treatment_period`, `treatment_fraction`).
- **Note:** The analytical power methods (`PowerAnalysis.power/mde/sample_size` and the `compute_power/compute_mde/compute_sample_size` convenience functions) accept a `deff` parameter (survey design effect, default 1.0). This inflates variance multiplicatively: `Var(ATT) *= deff`, and inflates required sample size: `n_total *= deff`. The `deff` parameter is **not redundant** with `rho` (intra-cluster correlation): `rho` models within-unit serial correlation in panel data via the Moulton factor `1 + (T-1)*rho`, while `deff` models the survey design effect from stratified multi-stage sampling (clustering + unequal weighting). A survey panel study may need both. Values `deff > 0` are accepted; `deff < 1.0` (net variance reduction, e.g., from stratification gain) emits a warning.
- **Note:** `simulate_power()` catches a narrow set of exception types — `ValueError`, `numpy.linalg.LinAlgError`, `KeyError`, `RuntimeError`, `ZeroDivisionError` — raised inside the per-simulation fit and result-extraction block, increments a per-effect failure counter, and skips the replicate. Programming errors (`TypeError`, `AttributeError`, `NameError`, `IndexError`, etc.) are allowed to propagate so that bugs in the estimator or custom result extractor surface loudly instead of being absorbed as simulation failures. The primary-effect failure count is surfaced on the result object as `SimulationPowerResults.n_simulation_failures`; a `UserWarning` still fires when the failure rate exceeds 10% for any effect size, and all-failed runs raise `RuntimeError`. This replaces the prior bare `except Exception` that swallowed root causes and kept the counter internal to the function (axis C — silent fallback — under the Phase 2 audit).
- **Note:** The simulation-based power functions (`simulate_power/simulate_mde/simulate_sample_size`) accept a `survey_config` parameter (`SurveyPowerConfig` dataclass). When set, the simulation loop uses `generate_survey_did_data` instead of the default registry DGP, and automatically injects `SurveyDesign(weights="weight", strata="stratum", psu="psu", fpc="fpc")` into the estimator's `fit()` call. Supported estimators: DifferenceInDifferences, TwoWayFixedEffects, MultiPeriodDiD, CallawaySantAnna, SunAbraham, ImputationDiD, TwoStageDiD, StackedDiD, EfficientDiD. Unsupported (raises `ValueError`): TROP, SyntheticDiD, TripleDifference (generate_survey_did_data produces staggered cohort data incompatible with factor-model/DDD DGPs). `survey_config` and `data_generator` are mutually exclusive. `data_generator_kwargs` may not contain keys managed by `SurveyPowerConfig` (n_strata, psu_per_stratum, etc.) but may contain passthrough DGP params (unit_fe_sd, add_covariates, strata_sizes). Repeated cross-section survey power (`panel=False`) is only supported for `CallawaySantAnna(panel=False)` with a matching `data_generator_kwargs={"panel": False}`; both mismatch directions are rejected. `estimator_kwargs` may not contain `survey_design` when `survey_config` is set (use `SurveyPowerConfig(survey_design=...)` instead). Estimator settings that require a multi-cohort DGP (`control_group="not_yet_treated"`, `control_group="last_cohort"`, `clean_control="strict"`) are rejected because the survey DGP uses a single cohort; use the custom `data_generator` path for these configurations. `simulate_sample_size` raises the bisection floor to `n_strata * psu_per_stratum * 2` to ensure viable survey structure and rejects `strata_sizes` in `data_generator_kwargs` (it depends on `n_units` which varies during bisection).

**Reference implementation(s):**
Expand Down
156 changes: 156 additions & 0 deletions tests/test_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,162 @@ class Result:
# Should have completed successfully without warning
assert len([x for x in w if "simulations" in str(x.message)]) == 0

def test_simulation_failure_counter_on_result(self):
    """The result object's `n_simulation_failures` exposes the internal failure count."""
    from diff_diff.prep import generate_did_data

    class EveryOtherCallFails:
        """Estimator stub whose `fit` raises ValueError on even-numbered calls (~50%)."""

        def __init__(self):
            self.call_count = 0

        def fit(self, data, **kwargs):
            self.call_count += 1
            if self.call_count % 2 == 0:
                raise ValueError("forced simulated failure")

            class FakeResult:
                att = 5.0
                se = 1.0
                p_value = 0.01
                conf_int = (3.0, 7.0)

            return FakeResult()

    # Failure-rate UserWarnings are expected at 50%; silence them for this test.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        results = simulate_power(
            estimator=EveryOtherCallFails(),
            n_simulations=20,
            progress=False,
            data_generator=generate_did_data,
        )

    # 20 attempts alternating pass/fail: exactly 10 of each, summing to the total.
    n_ok, n_bad = results.n_simulations, results.n_simulation_failures
    assert n_bad == 10
    assert n_ok == 10
    assert n_ok + n_bad == 20

def test_simulation_failure_counter_zero_on_clean_run(self):
    """A fully successful run reports exactly 0 failures (not omitted, not None)."""
    sim_kwargs = dict(
        n_units=50,
        n_periods=4,
        treatment_effect=5.0,
        sigma=2.0,
        n_simulations=15,
        seed=42,
        progress=False,
    )
    results = simulate_power(estimator=DifferenceInDifferences(), **sim_kwargs)
    assert results.n_simulation_failures == 0

def test_simulation_does_not_swallow_programming_errors(self):
    """`TypeError` (a programming-bug signal) must propagate, not count as a failure."""
    from diff_diff.prep import generate_did_data

    class BuggyEstimator:
        """Stub whose `fit` raises TypeError — outside the narrow caught set."""

        def fit(self, data, **kwargs):
            raise TypeError("programming bug — must propagate")

    # The narrow except clause must let TypeError escape the simulation loop.
    with pytest.raises(TypeError, match="programming bug"):
        simulate_power(
            estimator=BuggyEstimator(),
            n_simulations=5,
            progress=False,
            data_generator=generate_did_data,
        )

def test_simulation_all_failed_raises_runtime_error(self):
    """When every replicate fails, the narrow-except path still raises RuntimeError."""
    from diff_diff.prep import generate_did_data

    class NeverSucceeds:
        """Stub whose `fit` always raises a caught exception type (ValueError)."""

        def fit(self, data, **kwargs):
            raise ValueError("every replicate fails")

    # Suppress the expected failure-rate warning; only the final error matters here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        with pytest.raises(RuntimeError, match="All simulations failed"):
            simulate_power(
                estimator=NeverSucceeds(),
                n_simulations=5,
                progress=False,
                data_generator=generate_did_data,
            )

def test_simulation_failure_counter_survives_serialization(self):
    """`n_simulation_failures` appears with the correct value in `to_dict()` output."""
    from diff_diff.prep import generate_did_data

    class EveryOtherCallFails:
        """Estimator stub alternating success/ValueError to produce 10 failures."""

        def __init__(self):
            self.call_count = 0

        def fit(self, data, **kwargs):
            self.call_count += 1
            if self.call_count % 2 == 0:
                raise ValueError("forced simulated failure")

            class FakeResult:
                att = 5.0
                se = 1.0
                p_value = 0.01
                conf_int = (3.0, 7.0)

            return FakeResult()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        results = simulate_power(
            estimator=EveryOtherCallFails(),
            n_simulations=20,
            progress=False,
            data_generator=generate_did_data,
        )

    payload = results.to_dict()
    assert "n_simulation_failures" in payload
    assert payload["n_simulation_failures"] == results.n_simulation_failures == 10

def test_simulation_failure_rate_warning_above_threshold(self):
    """Failure rates above the 10% threshold emit the per-effect-size UserWarning."""
    from diff_diff.prep import generate_did_data

    class FourOfFiveFail:
        """Stub failing 16 of 20 calls (80%) — far above the 10% warning threshold."""

        def __init__(self):
            self.call_count = 0

        def fit(self, data, **kwargs):
            self.call_count += 1
            # Succeed only on every fifth call.
            if self.call_count % 5 != 0:
                raise ValueError("forced failure")

            class FakeResult:
                att = 5.0
                se = 1.0
                p_value = 0.01
                conf_int = (3.0, 7.0)

            return FakeResult()

    with pytest.warns(UserWarning, match=r"simulations .* failed for effect_size="):
        results = simulate_power(
            estimator=FourOfFiveFail(),
            n_simulations=20,
            progress=False,
            data_generator=generate_did_data,
        )

    assert results.n_simulation_failures == 16
    assert results.n_simulations == 4


class TestVisualization:
"""Tests for power curve visualization."""
Expand Down
Loading