From 73ef44cd927ad947f8d81cfe5cb6ce06fb6cfe88 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 19:06:15 -0400 Subject: [PATCH 1/3] Add BR/DR canonical-dataset validation + two wording fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes BR/DR foundation gap #4 (real-dataset validation) from the external-positioning gap list in ``project_br_dr_foundation.md``. Validation artifact: - ``docs/validation/validate_br_dr_canonical.py`` runs BusinessReport / DiagnosticReport on Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), and Castle Doctrine (Cheng-Hoekstra 2013 under both CS and SA), dumping summary + full_report + selected to_dict blocks for each. - ``docs/validation/br_dr_canonical_validation.md`` is the regenerable raw output. - ``docs/validation/br_dr_canonical_findings.md`` is the hand-written synthesis: direction / verdict / sensitivity tier all match canonical interpretations, with two small wording bugs surfaced and fixed in this PR and two larger gaps queued as follow-up (SA HonestDiD applicability, target-parameter disambiguation). Wording fixes: 1. Treatment-label capitalization. ``str.capitalize()`` lowercased every character after the first, flattening embedded abbreviations (``"the NJ minimum-wage increase"`` → ``"The nj minimum-wage increase"``) and proper-noun phrases (``"Castle Doctrine law adoption"`` → ``"Castle doctrine law adoption"``). Replaced with a ``_sentence_first_upper`` helper that preserves user-supplied casing. 2. ``breakdown_M == 0`` phrasing. The HonestDiD fragile sentence quoted ``{breakdown_M:.2g}x the pre-period variation``, which renders as a degenerate ``0x`` on the exact-zero case surfaced by Cheng-Hoekstra. At ``breakdown_M <= 0.05`` (covers 0 and near-zero values), both BR's summary and DR's overall_interpretation now say "includes zero even at the smallest parallel-trends violations on the sensitivity grid" instead. 
Tests: 5 new regressions in ``TestCanonicalValidationSurfaceFixes`` covering both fixes + three boundary cases (exact zero, small positive, normal fragile value). Not in scope: Favara-Imbs (dCDH reversible-treatment dataset not bundled), ImputationDiD / TwoStageDiD on canonical data (needed to exercise the R42 untreated-outcome FE assumption branch on real data), SA HonestDiD applicability gap. All tracked in the findings doc for follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 54 +- diff_diff/diagnostic_report.py | 31 +- docs/validation/br_dr_canonical_findings.md | 175 +++++ docs/validation/br_dr_canonical_validation.md | 723 ++++++++++++++++++ docs/validation/validate_br_dr_canonical.py | 308 ++++++++ tests/test_business_report.py | 168 ++++ 6 files changed, 1446 insertions(+), 13 deletions(-) create mode 100644 docs/validation/br_dr_canonical_findings.md create mode 100644 docs/validation/br_dr_canonical_validation.md create mode 100644 docs/validation/validate_br_dr_canonical.py diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 2445251f..bbbeadc7 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1854,6 +1854,24 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _sentence_first_upper(text: str) -> str: + """Uppercase only the first character of ``text``, preserving all + other casing. Unlike ``str.capitalize()``, which lowercases every + character after the first, this keeps user-supplied abbreviations + and proper nouns intact. 
+ + Examples + -------- + >>> _sentence_first_upper("the NJ minimum-wage increase") + 'The NJ minimum-wage increase' + >>> _sentence_first_upper("Castle Doctrine law adoption") + 'Castle Doctrine law adoption' + """ + if not text: + return text + return text[0].upper() + text[1:] + + def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str: """Return a direction-aware verb for the headline sentence. @@ -1929,7 +1947,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: # is not actually available. ci_str = " (inference unavailable: confidence interval is undefined for this fit)" by_clause = f" by {magnitude}" if effect != 0 else "" - return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}." + # Round-1 BR/DR canonical-validation (2026-04-19): Python's + # ``str.capitalize()`` lowercases everything except the first + # character, so ``"the NJ minimum-wage increase".capitalize()`` + # returns ``"The nj minimum-wage increase"`` — flattening the + # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger, + # Castle Doctrine) carry proper-noun / acronym tokens in the + # user-supplied ``treatment_label``, so preserve user casing and + # only ensure the first character is uppercase. + treatment_sentence = _sentence_first_upper(treatment) + return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}." def _render_summary(schema: Dict[str, Any]) -> str: @@ -2088,11 +2115,26 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) elif isinstance(bkd, (int, float)): - sentences.append( - f"HonestDiD: the result is fragile — the confidence interval " - f"includes zero once violations reach {bkd:.2g}x the " - f"pre-period variation." - ) + # Round-1 BR/DR canonical-validation (2026-04-19): + # ``breakdown_M`` at or near zero reads as "0x the + # pre-period variation" which is a degenerate sentence + # (zero-times-anything is zero). 
The correct wording when + # the CI includes zero at the smallest grid point is to + # say the result is fragile to essentially any nonzero + # violation, not to quote the ``0x`` multiplier. + if bkd <= 0.05: + sentences.append( + "HonestDiD: the result is fragile — the confidence " + "interval includes zero even at the smallest " + "parallel-trends violations on the sensitivity " + "grid." + ) + else: + sentences.append( + f"HonestDiD: the result is fragile — the confidence " + f"interval includes zero once violations reach {bkd:.2g}x " + f"the pre-period variation." + ) # Sample sentence. For fits with a dynamic comparison set (CS / # ContinuousDiD / StaggeredTripleDiff / EfficientDiD / diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 0fe798a9..21ce917d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -3118,13 +3118,30 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"pre-period variation." ) else: - sentences.append( - f"HonestDiD sensitivity: the result is fragile — the " - f"confidence interval includes zero once violations reach " - f"{bkd:.2g}x the pre-period variation." - if isinstance(bkd, (int, float)) - else "" - ) + # Round-1 BR/DR canonical-validation (2026-04-19): the + # "fragile — CI includes zero once violations reach 0x + # the pre-period variation" wording is a degenerate + # sentence at the ``breakdown_M == 0`` edge case + # surfaced by the Cheng-Hoekstra (2013) Castle Doctrine + # dataset. Mirror BR's fix: when the breakdown value is + # at or near zero, say the CI includes zero at the + # smallest grid point rather than quoting a ``0x`` + # multiplier. + if isinstance(bkd, (int, float)): + if bkd <= 0.05: + sentences.append( + "HonestDiD sensitivity: the result is fragile — " + "the confidence interval includes zero even at " + "the smallest parallel-trends violations on the " + "sensitivity grid." 
+ ) + else: + sentences.append( + f"HonestDiD sensitivity: the result is fragile — " + f"the confidence interval includes zero once " + f"violations reach {bkd:.2g}x the pre-period " + f"variation." + ) # Sentence 4: one secondary caveat if present. bacon = schema.get("bacon") or {} diff --git a/docs/validation/br_dr_canonical_findings.md b/docs/validation/br_dr_canonical_findings.md new file mode 100644 index 00000000..7f369573 --- /dev/null +++ b/docs/validation/br_dr_canonical_findings.md @@ -0,0 +1,175 @@ +# BR / DR canonical-dataset validation findings + +This file records divergences observed in +``br_dr_canonical_validation.md`` against canonical literature +interpretations. Generated by running +``docs/validation/validate_br_dr_canonical.py`` on the bundled +datasets (Card-Krueger 1994, Callaway-Sant'Anna mpdta benchmark, +Castle Doctrine / Cheng-Hoekstra 2013). This closes BR/DR +foundation gap #4 — real-dataset validation — from the +external-positioning gap list in +``project_br_dr_foundation.md``. + +The goal of the validation exercise is to stress-test BR's prose on +fits that published applied work has already interpreted, not to +exactly reproduce their point estimates (the bundled datasets are +either the R `did` package simulated benchmark or the causaldata +mirrors, which may differ from the original author data). + +## Headline assessment + +BR's prose direction, verdicts, and caveat framing match canonical +interpretations across all four runs: + +- **Card-Krueger**: positive sign, CI includes zero, "data consistent + with no effect." Matches the famous Card-Krueger finding of no + disemployment. +- **mpdta (CS)**: aggregate ATT negative (-0.021 log-points), pre-trends + `no_detected_violation`, HonestDiD `robust_to_M_1.28`. Matches CS + tutorial expectations that the fit is robust. +- **Castle Doctrine (CS)**: positive sign (homicides went up), pre-trends + `clear_violation` (joint p = 0.003), HonestDiD `fragile` + (breakdown_M = 0). 
Matches Cheng-Hoekstra's escalation finding AND + correctly flags the identifying-assumption fragility the staggered + rollout produces. +- **Castle Doctrine (SA)**: identical point estimates (as expected — + CS and SA are algebraically consistent on this data), same clear PT + violation verdict. + +No wrong-sign or wrong-verdict findings surfaced on any of the four +runs. The Bacon "already-robust" framing lifted from round-45 reads +correctly on the staggered fits (CS and SA on Castle Doctrine and +mpdta): the caveat is scoped as a statement about the rollout +design, not a switch-estimator recommendation. + +## Issues fixed in this PR + +Small prose bugs surfaced by the real-data output. Each is a wording +fix, not a methodology defect. Both are regression-tested under +``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes``. + +### Issue 1 (FIXED): Treatment label first-word capitalization eats abbreviations + +Card-Krueger output: + +> The **nj** minimum-wage increase lifted FTE employment by 1.47 FTE … + +Castle Doctrine output (CS and SA): + +> **Castle doctrine** law adoption worsened Homicide rate (per 100k) … + +BR used ``str.capitalize()``, which lowercases every character after +the first. For labels starting with an abbreviation (``"the NJ +minimum-wage increase"``) or a proper-noun phrase (``"Castle Doctrine +law adoption"``), this flattened case in a way that looked wrong in +stakeholder-facing prose. + +**Fix**: replaced ``str.capitalize()`` with a new +``_sentence_first_upper`` helper that uppercases only the first +character and preserves user-supplied casing for everything else. + +### Issue 2 (FIXED): ``breakdown_M = 0`` phrasing reads as "0x" (zero-times-something) + +Castle Doctrine (CS): + +> HonestDiD: the result is fragile — the confidence interval +> includes zero once violations reach **0x** the pre-period +> variation.
+ +When the breakdown M is exactly 0, "reach 0x the pre-period +variation" is a degenerate reading — the CI already includes zero +under any nonzero pre-trend violation (or even with zero violation, +depending on the grid). + +**Fix**: when ``breakdown_M <= 0.05``, both BR's summary and DR's +overall-interpretation sentence emit "the confidence interval +includes zero even at the smallest parallel-trends violations on +the sensitivity grid" instead of quoting the ``0x`` multiplier. The +0.05 threshold also covers near-zero values (e.g., 0.03) where the +multiplier is equally uninformative to stakeholders. + +### Issue 3 (deferred): Outcome-label capitalization in mid-sentence + +mpdta output: + +> … reduced **Log employment** by 0.0214 log-points … + +The user-supplied ``outcome_label="Log employment"`` is +capitalized as-is, which looks awkward mid-sentence. This is +stylistic, and arguably user-controllable (the user could pass +``outcome_label="log employment"``). Deprioritize unless fixing +Issue 1 is trivially extensible. Noted here for follow-up. + +## Issues to track as follow-up (out-of-scope for this PR) + +### Follow-up A: ``SunAbrahamResults`` excluded from HonestDiD applicability + +BR's applicability matrix (``diagnostic_report.py`` line ~107) lists +``SunAbrahamResults`` with ``{parallel_trends, pretrends_power, bacon, +design_effect, heterogeneity}`` — NO ``sensitivity``. But the +original plan's applicability matrix in +``project_br_dr_foundation.md`` and the SA methodology surface +(event-study coefficients + VCov) both support HonestDiD in principle. + +Observed on Castle Doctrine (SA): + +> ## Sensitivity (HonestDiD) +> +> - Sensitivity not computed: sensitivity is not applicable to +> SunAbrahamResults. + +Given SA shows the same PT violation that CS does on this dataset, +not having HonestDiD sensitivity on SA is a real usability gap. 
This +requires adding an SA adapter to ``compute_honest_did`` and expanding +the applicability matrix; it is library work beyond BR/DR prose. +Belongs in the BR/DR gap-list expansion. + +### Follow-up B: Target-parameter clarity (gap list item #6) + +The assumption block on every staggered fit still reads: + +> Identification relies on parallel trends across treatment cohorts +> and time periods (group-time ATT), plus no anticipation. + +But the CS ``overall_att`` for mpdta is a specific weighted average +of ``ATT(g, t)`` cells, SA is an IW average, Stacked is a +sub-experiment-weighted average, dCDH is a switchers average. BR's +headline reports a single number without disambiguating the +estimand. For Baker et al. (2025) practitioner-guide parity, the +assumption block should carry the target-parameter clause. +Already tracked as gap #6. + +### Follow-up C: Card-Krueger effect size differs from published ATT + +Our bundled ``load_card_krueger`` returns an ATT of +1.47 FTE; the +published Card-Krueger ATT is ~+0.59 FTE. The direction and +CI-includes-zero verdict match canonical, but the magnitude does +not. This is a ``datasets.py`` data-loading question (the +causaldata mirror may aggregate differently than the original +author sample), not a BR prose bug. Noted here so a future +data-validation PR can address it upstream. + +## What was validated (summary) + +- End-to-end BR / DR flow runs without errors on 4 canonical datasets. +- Direction of the effect matches canonical interpretation on all 4. +- Pre-trends verdict tier (no_detected_violation / clear_violation) + matches the literature's reading. +- HonestDiD sensitivity tier (robust vs fragile) matches. +- Bacon "already-robust" framing from round-45 reads correctly on + real staggered data. 
+- The identifying-assumption source-faithfulness retags from + round-42 (BJS / Gardner untreated-outcome FE model) did not + surface on these runs because none of the datasets was run + through ImputationDiD or TwoStageDiD — follow-up validation + should add those. + +## Regeneration + +```bash +python docs/validation/validate_br_dr_canonical.py +``` + +The script writes ``br_dr_canonical_validation.md`` (the raw output +artifact); this file is the findings synthesis and is written by +hand from that artifact. diff --git a/docs/validation/br_dr_canonical_validation.md b/docs/validation/br_dr_canonical_validation.md new file mode 100644 index 00000000..99bb5af0 --- /dev/null +++ b/docs/validation/br_dr_canonical_validation.md @@ -0,0 +1,723 @@ +# BR / DR canonical-dataset validation + +Output of ``docs/validation/validate_br_dr_canonical.py``. Each section runs BusinessReport (and its auto-constructed DiagnosticReport) on a canonical DiD dataset and dumps summary + full_report + selected to_dict blocks. The purpose is to compare BR's prose output against published canonical interpretations and record divergences in ``br_dr_canonical_findings.md``. + +This file is regenerable; do not hand-edit. + +Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA). + +--- + +## Card & Krueger (1994): NJ/PA minimum wage +Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA (control, $4.25 throughout). Outcome: full-time equivalent employment. N=310 stores. + +Canonical interpretation: no significant disemployment effect of the minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). The famous finding was that the CI included zero. + +### BusinessReport.summary() +``` +Question: Did the NJ minimum-wage increase reduce fast-food employment? The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). 
Statistically, the confidence interval includes zero; the data are consistent with no effect. Sample: 620 observations (462 treated, 158 control). +``` +### BusinessReport.full_report() +```markdown +# Business Report: FTE employment + +**Question**: Did the NJ minimum-wage increase reduce fast-food employment? + +**Estimator**: `DiDResults` + +## Headline + +The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). + +Statistically, the confidence interval includes zero; the data are consistent with no effect. + +## Identifying Assumption + +Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group. + +## Pre-Trends + +- Pre-trends not computed: auto_diagnostics=False + +## Sensitivity (HonestDiD) + +- Sensitivity not computed: auto_diagnostics=False + +## Sample + +- Observations: 620 +- Treated: 462 +- Control: 158 + +## References + +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +====================================================================== + Difference-in-Differences Estimation Results +====================================================================== + +Observations: 620 +Treated: 462 +Control: 158 +R-squared: 0.0036 +Variance: HC1 heteroskedasticity-robust + +---------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| +---------------------------------------------------------------------- +ATT 1.4718 1.9320 0.762 0.4465 +---------------------------------------------------------------------- + +95% Confidence Interval: [-2.3224, 5.2660] +CV (SE/|ATT|): 1.3127 + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 +====================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": 1.4718176338428604, + "se": 1.9320362599811534, + "ci_lower": -2.322358689575049, + "ci_upper": 5.26599395726077, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 0.4464732839915416, + "is_significant": false, + "near_significance_threshold": false, + "unit": "FTE", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": null +} +``` +```json +{ + "parallel_trends_variant": "unconditional", + "no_anticipation": true, + "description": "Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group." +} +``` +```json +[] +``` + +--- +## Callaway-Sant'Anna benchmark (mpdta) +Data: simulated county-level panel from R `did` package (Callaway & Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: log employment (`lemp`). + +Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) on treated counties; group-specific ATT(g,t) negative across cohorts. See CS (2021) Figures 1-2. + +### BusinessReport.summary() +``` +Question: Did minimum-wage increases reduce county employment? The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients do not reject parallel trends; the test is moderately informative. See the sensitivity analysis below for bounded-violation guarantees. HonestDiD: the result remains significant under parallel-trends violations up to 1.3x the observed pre-period variation. Sample: 2,500 observations (309 treated, 191 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Log employment + +**Question**: Did minimum-wage increases reduce county employment? + +**Estimator**: `CallawaySantAnnaResults` + +## Headline + +The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). + +Statistically, the direction of the effect is strongly supported by the data. + +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `no_detected_violation` (joint p = 0.482) +- Power tier: `moderately_powered` +- Minimum detectable violation (MDV): 0.0105 +- MDV / |ATT|: 0.49 + +## Sensitivity (HonestDiD) + +- Method: `relative_magnitude` +- Breakdown M: 1.28 +- Conclusion: `robust_to_M_1.28` + +## Sample + +- Observations: 2,500 +- Treated: 309 +- Control: 191 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 4 +- Range: -0.0293 to -0.00305 +- CV: 0.668 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +- **INFO** — The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. 
For small effects, log-points approximate percentage changes. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. +- Compare with alternative estimators (SA, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. + +## References + +- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Callaway-Sant'Anna Staggered Difference-in-Differences Results +===================================================================================== + +Total observations: 2500 +Treated units: 309 +Never-treated units: 191 +Treatment cohorts: 3 +Time periods: 5 +Control group: never_treated +Base period: universal + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. 
+------------------------------------------------------------------------------------- +ATT -0.0214 0.0019 -11.397 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [-0.0251, -0.0178] +CV (SE/|ATT|): 0.0877 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +-4 0.0023 0.0036 0.627 0.5309 +-3 -0.0019 0.0023 -0.810 0.4179 +-2 -0.0020 0.0022 -0.875 0.3818 +-1 0.0000 nan nan nan +0 -0.0293 0.0019 -15.137 0.0000 *** +1 -0.0235 0.0023 -10.111 0.0000 *** +2 -0.0134 0.0031 -4.373 0.0000 *** +3 -0.0031 0.0035 -0.884 0.3767 +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": -0.021448663176265446, + "se": 0.0018820025192546833, + "ci_lower": -0.025137320332818274, + "ci_upper": -0.01776000601971262, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 4.341504320370796e-30, + "is_significant": true, + "near_significance_threshold": false, + "unit": "log_points", + "unit_kind": "log_points", + "sign": "negative", + "breakdown_M": 1.2776496410369873 +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." 
+} +``` +```json +{ + "status": "computed", + "method": "joint_wald_event_study", + "joint_p_value": 0.4816505473216015, + "verdict": "no_detected_violation", + "n_pre_periods": 3, + "n_dropped_undefined": null, + "reason": null, + "df_denom": null, + "power_status": "ran", + "power_reason": null, + "power_tier": "moderately_powered", + "mdv": 0.010472079171705551, + "mdv_share_of_att": 0.48823924762330606, + "power_covariance_source": "diag_fallback_available_full_vcov_unused" +} +``` +```json +{ + "status": "computed", + "method": "relative_magnitude", + "breakdown_M": 1.2776496410369873, + "conclusion": "robust_to_M_1.28", + "grid": [ + { + "M": 0.5, + "ci_lower": -0.026608074507223883, + "ci_upper": -0.008013136465533054, + "bound_lower": -0.022462755203290868, + "bound_upper": -0.01215845576946607, + "robust_to_zero": true + }, + { + "M": 1.0, + "ci_lower": -0.03176022422413628, + "ci_upper": -0.0028609867486206544, + "bound_lower": -0.027614904920203267, + "bound_upper": -0.00700630605255367, + "robust_to_zero": true + }, + { + "M": 1.5, + "ci_lower": -0.03691237394104868, + "ci_upper": 0.002291162968291743, + "bound_lower": -0.03276705463711566, + "bound_upper": -0.0018541563356412726, + "robust_to_zero": false + }, + { + "M": 2.0, + "ci_lower": -0.04206452365796108, + "ci_upper": 0.007443312685204144, + "bound_lower": -0.03791920435402807, + "bound_upper": 0.0032979933812711283, + "robust_to_zero": false + } + ] +} +``` +```json +[ + { + "severity": "warning", + "topic": "bacon_contamination", + "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." 
+ }, + { + "severity": "info", + "topic": "unit_policy", + "message": "The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes." + } +] +``` + +--- +## Cheng & Hoekstra (2013): Castle Doctrine laws +Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. Outcome: homicide rate per 100k population. + +Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in homicide rates in states that adopted Castle Doctrine (no deterrent effect; if anything, an escalation). + +### BusinessReport.summary() +``` +Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.00347); the headline should be treated as tentative pending the sensitivity analysis below. HonestDiD: the result is fragile — the confidence interval includes zero even at the smallest parallel-trends violations on the sensitivity grid. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Homicide rate (per 100k) + +**Question**: Did Castle Doctrine law adoption change state homicide rates? 
+ +**Estimator**: `CallawaySantAnnaResults` + +## Headline + +Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). + +Statistically, the direction of the effect is strongly supported by the data. + +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `clear_violation` (joint p = 0.00347) +- Power tier: `underpowered` +- Minimum detectable violation (MDV): 0.732 +- MDV / |ATT|: 1.3 + +## Sensitivity (HonestDiD) + +- Method: `relative_magnitude` +- Breakdown M: 0 +- Conclusion: `fragile` + +## Sample + +- Observations: 539 +- Treated: 22 +- Control: 27 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 6 +- Range: 0.237 to 0.764 +- CV: 0.348 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +- **WARNING** — HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. 
+- Compare with alternative estimators (SA, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. + +## References + +- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Callaway-Sant'Anna Staggered Difference-in-Differences Results +===================================================================================== + +Total observations: 539 +Treated units: 22 +Never-treated units: 27 +Treatment cohorts: 6 +Time periods: 11 +Control group: never_treated +Base period: universal + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +ATT 0.5608 0.1216 4.613 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [0.3225, 0.7991] +CV (SE/|ATT|): 0.2168 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. 
t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +-10 -0.3415 0.1462 -2.336 0.0195 * +-9 0.3406 0.5526 0.616 0.5377 +-8 -0.1465 0.1794 -0.816 0.4143 +-7 0.1393 0.3426 0.406 0.6844 +-6 0.2611 0.1574 1.659 0.0972 . +-5 -0.0466 0.1215 -0.383 0.7015 +-4 0.1224 0.1511 0.810 0.4180 +-3 0.0783 0.1505 0.520 0.6030 +-2 0.1541 0.1085 1.420 0.1555 +-1 0.0000 nan nan nan +0 0.4453 0.1606 2.772 0.0056 ** +1 0.7074 0.1957 3.614 0.0003 *** +2 0.7642 0.1590 4.807 0.0000 *** +3 0.5525 0.1582 3.492 0.0005 *** +4 0.2367 0.1789 1.323 0.1859 +5 0.6463 0.1227 5.269 0.0000 *** +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": 0.5608256172839505, + "se": 0.12157293428086259, + "ci_lower": 0.32254704459860495, + "ci_upper": 0.7991041899692961, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 3.967463629059167e-06, + "is_significant": true, + "near_significance_threshold": false, + "unit": "per 100k population", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": 0.0 +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." 
+} +``` +```json +{ + "status": "computed", + "method": "joint_wald_event_study", + "joint_p_value": 0.003469090217576798, + "verdict": "clear_violation", + "n_pre_periods": 9, + "n_dropped_undefined": null, + "reason": null, + "df_denom": null, + "power_status": "ran", + "power_reason": null, + "power_tier": "underpowered", + "mdv": 0.7318611799799601, + "mdv_share_of_att": 1.3049710238350487, + "power_covariance_source": "diag_fallback_available_full_vcov_unused" +} +``` +```json +{ + "status": "computed", + "method": "relative_magnitude", + "breakdown_M": 0.0, + "conclusion": "fragile", + "grid": [ + { + "M": 0.5, + "ci_lower": -0.84457211437162, + "ci_upper": 1.9620045961559538, + "bound_lower": -0.6348485739226502, + "bound_upper": 1.752281055706984, + "robust_to_zero": false + }, + { + "M": 1.0, + "ci_lower": -2.038136929186437, + "ci_upper": 3.1555694109707706, + "bound_lower": -1.8284133887374672, + "bound_upper": 2.9458458705218007, + "robust_to_zero": false + }, + { + "M": 1.5, + "ci_lower": -3.231701744001254, + "ci_upper": 4.349134225785587, + "bound_lower": -3.021978203552284, + "bound_upper": 4.1394106853366175, + "robust_to_zero": false + }, + { + "M": 2.0, + "ci_lower": -4.4252665588160705, + "ci_upper": 5.542699040600405, + "bound_lower": -4.215543018367101, + "bound_upper": 5.332975500151435, + "robust_to_zero": false + } + ] +} +``` +```json +[ + { + "severity": "warning", + "topic": "bacon_contamination", + "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." 
+ }, + { + "severity": "warning", + "topic": "sensitivity_fragility", + "message": "HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative." + } +] +``` + +--- +## Castle Doctrine under Sun-Abraham (2021) +Same dataset and research question; different estimator. Testing BR/DR cross-estimator narrative consistency. + +### BusinessReport.summary() +``` +Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.0128); the headline should be treated as tentative. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Homicide rate (per 100k) + +**Question**: Did Castle Doctrine law adoption change state homicide rates? + +**Estimator**: `SunAbrahamResults` + +## Headline + +Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). + +Statistically, the direction of the effect is strongly supported by the data. 
+ +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `clear_violation` (joint p = 0.0128) +- Power tier: `moderately_powered` +- Minimum detectable violation (MDV): 0.551 +- MDV / |ATT|: 0.98 + +## Sensitivity (HonestDiD) + +- Sensitivity not computed: sensitivity is not applicable to SunAbrahamResults. + +## Sample + +- Observations: 539 +- Treated: 22 +- Control: 27 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 6 +- Range: 0.237 to 0.764 +- CV: 0.348 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. +- Specification-based falsification + - _why_: Compare results across control group definitions (never_treated vs not_yet_treated) and anticipation settings to assess robustness. +- Compare with alternative estimators (CS, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. 
+ +## References + +- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Sun-Abraham Interaction-Weighted Estimator Results +===================================================================================== + +Total observations: 539 +Treated units: 22 +Control units: 27 +Treatment cohorts: 6 +Time periods: 11 +Control group: never_treated + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +ATT 0.5608 0.1208 4.642 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [0.3240, 0.7976] +CV (SE/|ATT|): 0.2154 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. t-stat P>|t| Sig. 
+------------------------------------------------------------------------------------- +-10 -0.3415 0.1566 -2.181 0.0292 * +-9 0.3406 0.1067 3.191 0.0014 ** +-8 -0.1465 0.1379 -1.062 0.2882 +-7 0.1393 0.3326 0.419 0.6754 +-6 0.2611 0.1646 1.586 0.1128 +-5 -0.0466 0.1181 -0.394 0.6933 +-4 0.1224 0.1344 0.911 0.3625 +-3 0.0783 0.1576 0.497 0.6194 +-2 0.1541 0.0957 1.610 0.1075 +0 0.4453 0.1627 2.737 0.0062 ** +1 0.7074 0.1903 3.716 0.0002 *** +2 0.7642 0.1612 4.739 0.0000 *** +3 0.5525 0.1646 3.357 0.0008 *** +4 0.2367 0.1909 1.240 0.2150 +5 0.6463 0.1313 4.921 0.0000 *** +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption +```json +{ + "effect": 0.5608256172839505, + "se": 0.12081241968043965, + "ci_lower": 0.3240376258251508, + "ci_upper": 0.7976136087427503, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 3.448543002483855e-06, + "is_significant": true, + "near_significance_threshold": false, + "unit": "per 100k population", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": null +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." +} +``` + +--- diff --git a/docs/validation/validate_br_dr_canonical.py b/docs/validation/validate_br_dr_canonical.py new file mode 100644 index 00000000..cc411551 --- /dev/null +++ b/docs/validation/validate_br_dr_canonical.py @@ -0,0 +1,308 @@ +"""Run BusinessReport / DiagnosticReport on canonical DiD datasets. 
+ +Writes ``docs/validation/br_dr_canonical_validation.md`` with the +full BR ``summary()`` + ``full_report()`` + selected ``to_dict()`` +blocks for each dataset. The markdown output is the reviewable +artifact; compare it against canonical literature interpretations +and record any divergences in +``docs/validation/br_dr_canonical_findings.md``. + +Purpose: BR/DR gap #4 (real-dataset validation) — synthetic-DGP +tests pass but we haven't checked whether the prose output matches +canonical interpretations of applied work. + +Run via: ``python docs/validation/validate_br_dr_canonical.py``. +""" + +from __future__ import annotations + +import json +import sys +import warnings +from pathlib import Path + +import numpy as np + +from diff_diff import ( + BusinessReport, + CallawaySantAnna, + DifferenceInDifferences, + SunAbraham, +) +from diff_diff.datasets import ( + load_card_krueger, + load_castle_doctrine, + load_mpdta, +) + +OUT_PATH = Path(__file__).parent / "br_dr_canonical_validation.md" + + +def _section(title: str, level: int = 2) -> str: + return "#" * level + " " + title + "\n" + + +def _fence(body: str, lang: str = "") -> str: + return f"```{lang}\n{body.rstrip()}\n```\n" + + +def _dump_block(name: str, block: dict) -> str: + return _fence(json.dumps(block, indent=2, default=str), "json") + + +def _card_krueger_section() -> str: + """Card & Krueger (1994) minimum wage — classic 2x2 DiD. + + Canonical finding: no significant negative effect of NJ minimum-wage + increase on fast-food employment; published ATT ~ +2.8 FTE or + approximately 0.6 FTE per store depending on specification. CI + includes zero; direction positive. + """ + parts = [_section("Card & Krueger (1994): NJ/PA minimum wage", 2)] + ck = load_card_krueger() + # Reshape wide -> long per the docstring example. 
+ ck_long = ck.melt( + id_vars=["store_id", "state", "treated"], + value_vars=["emp_pre", "emp_post"], + var_name="period", + value_name="employment", + ) + ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) + did = DifferenceInDifferences() + fit = did.fit(ck_long, outcome="employment", treatment="treated", time="post") + br = BusinessReport( + fit, + outcome_label="FTE employment", + outcome_unit="FTE", + outcome_direction="higher_is_better", + business_question="Did the NJ minimum-wage increase reduce fast-food employment?", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, # 2x2 PT needs manual column kwargs; run without for now + ) + parts.append( + "Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA " + "(control, $4.25 throughout). Outcome: full-time equivalent employment. " + f"N={len(ck)} stores.\n\n" + ) + parts.append( + "Canonical interpretation: no significant disemployment effect of the " + "minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). " + "The famous finding was that the CI included zero.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _mpdta_section() -> str: + """Callaway-Sant'Anna benchmark (mpdta): county-level log employment + under staggered minimum-wage increases. + + Canonical finding: CS aggregate ATT roughly -0.04 to -0.05 on log + employment (i.e., ~4-5% employment decline for treated counties). + Group-level ATT(g,t) shown in CS Figure 1. 
+ """ + parts = [_section("Callaway-Sant'Anna benchmark (mpdta)", 2)] + df = load_mpdta() + cs = CallawaySantAnna(base_period="universal") + fit = cs.fit( + df, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + fit, + outcome_label="Log employment", + outcome_unit="log_points", + outcome_direction="higher_is_better", + business_question="Did minimum-wage increases reduce county employment?", + treatment_label="the state-level minimum wage increase", + data=df, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + ) + parts.append( + "Data: simulated county-level panel from R `did` package (Callaway & " + "Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: " + "log employment (`lemp`).\n\n" + ) + parts.append( + "Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) " + "on treated counties; group-specific ATT(g,t) negative across cohorts. " + "See CS (2021) Figures 1-2.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) + parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _castle_doctrine_section() -> str: + """Cheng & Hoekstra (2013): staggered adoption of Castle Doctrine laws. + + Canonical finding: ~8% increase in homicide rates in adopting + states; no deterrent effect on burglary or other crimes. 
+ """ + parts = [_section("Cheng & Hoekstra (2013): Castle Doctrine laws", 2)] + df = load_castle_doctrine() + # CS with never-treated as control; outcome = homicide rate. + cs = CallawaySantAnna(base_period="universal", control_group="never_treated") + fit = cs.fit( + df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + fit, + outcome_label="Homicide rate (per 100k)", + outcome_unit="per 100k population", + outcome_direction="lower_is_better", + business_question=( + "Did Castle Doctrine law adoption change state homicide rates?" + ), + treatment_label="Castle Doctrine law adoption", + data=df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + parts.append( + "Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. " + "Outcome: homicide rate per 100k population.\n\n" + ) + parts.append( + "Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in " + "homicide rates in states that adopted Castle Doctrine (no deterrent " + "effect; if anything, an escalation).\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) + parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _castle_doctrine_sun_abraham_section() -> str: + """Same Castle Doctrine dataset but run through Sun-Abraham, as a + cross-estimator consistency 
check. If SA and CS narrate the same + canonical finding differently, that's a BR/DR source-faithfulness + issue. + """ + parts = [_section("Castle Doctrine under Sun-Abraham (2021)", 2)] + df = load_castle_doctrine() + sa = SunAbraham() + fit = sa.fit( + df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br = BusinessReport( + fit, + outcome_label="Homicide rate (per 100k)", + outcome_unit="per 100k population", + outcome_direction="lower_is_better", + business_question=( + "Did Castle Doctrine law adoption change state homicide rates?" + ), + treatment_label="Castle Doctrine law adoption", + data=df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + parts.append( + "Same dataset and research question; different estimator. Testing BR/DR " + "cross-estimator narrative consistency.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append("\n---\n") + return "".join(parts) + + +def main() -> int: + warnings.filterwarnings("ignore") + np.random.seed(42) + + header = ( + "# BR / DR canonical-dataset validation\n\n" + "Output of ``docs/validation/validate_br_dr_canonical.py``. Each section " + "runs BusinessReport (and its auto-constructed DiagnosticReport) on a " + "canonical DiD dataset and dumps summary + full_report + selected " + "to_dict blocks. 
The purpose is to compare BR's prose output against " + "published canonical interpretations and record divergences in " + "``br_dr_canonical_findings.md``.\n\n" + "This file is regenerable; do not hand-edit.\n\n" + "Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 " + "benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA).\n\n" + "---\n\n" + ) + + sections = [header] + for name, fn in ( + ("card_krueger", _card_krueger_section), + ("mpdta", _mpdta_section), + ("castle_doctrine_cs", _castle_doctrine_section), + ("castle_doctrine_sa", _castle_doctrine_sun_abraham_section), + ): + print(f"Running {name} ...", file=sys.stderr) + try: + sections.append(fn()) + except Exception as exc: # noqa: BLE001 + sections.append( + _section(f"{name} (ERROR)", 2) + + _fence(f"{type(exc).__name__}: {exc}") + + "\n---\n" + ) + print(f" {type(exc).__name__}: {exc}", file=sys.stderr) + + OUT_PATH.write_text("".join(sections)) + print(f"Wrote {OUT_PATH}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index cda3e5a7..77e059e3 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3975,6 +3975,174 @@ def test_br_rejects_both_passthrough_inputs_names_them(self): assert "precomputed['sensitivity']" in msg +class TestCanonicalValidationSurfaceFixes: + """Regression coverage for issues surfaced by the first round of + BR/DR canonical-dataset validation (``docs/validation/ + validate_br_dr_canonical.py``). Each test pins a wording bug + observed on a real published-applied-work fit. 
+ """ + + def _cs_like_stub_with_zero_breakdown(self): + """CS-style result stub matching the Cheng-Hoekstra Castle + Doctrine fit pattern.""" + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 0.5608 + stub.overall_se = 0.1216 + stub.overall_p_value = 0.0 + stub.overall_conf_int = (0.323, 0.799) + stub.alpha = 0.05 + stub.n_obs = 539 + stub.n_treated = 22 + stub.n_control_units = 27 + stub.survey_metadata = None + stub.event_study_effects = None + stub.base_period = "universal" + stub.inference_method = "analytical" + return stub + + def _fragile_dr_schema(self, breakdown_m: float): + """Build a fake DiagnosticReportResults whose ``sensitivity`` + block carries the given ``breakdown_M`` value.""" + from diff_diff.diagnostic_report import DiagnosticReportResults + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": {}, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": [], + }, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + return DiagnosticReportResults( + schema=schema, + interpretation="", + applicable_checks=("sensitivity",), + skipped_checks={}, + warnings=(), + ) + + def test_treatment_label_preserves_embedded_abbreviations(self): + """Card-Krueger use case: ``treatment_label="the NJ minimum-wage + increase"`` previously rendered as 
``"The nj minimum-wage + increase"`` because ``str.capitalize()`` lowercases every + character after the first. The fix preserves user-supplied + casing and only uppercases the first character. + """ + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.47 + stub.se = 1.93 + stub.t_stat = 0.76 + stub.p_value = 0.45 + stub.conf_int = (-2.32, 5.27) + stub.alpha = 0.05 + stub.n_obs = 620 + stub.n_treated = 462 + stub.n_control = 158 + stub.survey_metadata = None + stub.inference_method = "analytical" + br = BusinessReport( + stub, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + headline = br.headline() + assert "The NJ minimum-wage increase" in headline, ( + "Embedded ``NJ`` abbreviation must survive the first-word " + f"capitalization. Got headline: {headline!r}" + ) + assert "The nj" not in headline, ( + "Previous capitalize() bug lowercased the NJ abbreviation. " f"Got: {headline!r}" + ) + + def test_treatment_label_preserves_proper_noun_case(self): + """Castle Doctrine use case: ``treatment_label="Castle Doctrine + law adoption"`` previously rendered as ``"Castle doctrine law + adoption"`` because capitalize() lowercased the rest. Must + preserve proper-noun casing. + """ + stub = self._cs_like_stub_with_zero_breakdown() + br = BusinessReport( + stub, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + auto_diagnostics=False, + ) + headline = br.headline() + assert ( + "Castle Doctrine law adoption" in headline + ), f"Proper-noun casing must be preserved. Got: {headline!r}" + + def test_breakdown_m_zero_uses_smallest_grid_point_wording(self): + """Cheng-Hoekstra Castle Doctrine produces ``breakdown_M == 0`` + under HonestDiD. The old wording "violations reach 0x the + pre-period variation" reads as a degenerate zero-times-variation + sentence. 
The fix switches to "includes zero even at the + smallest parallel-trends violations on the sensitivity grid" + for breakdown values at or near zero. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.0) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "0x" not in summary, ( + f"Summary must not render ``0x the pre-period variation``; " + f"that reads as zero-times-anything. Got: {summary!r}" + ) + assert "smallest parallel-trends violations" in summary, ( + f"Summary must use the smallest-grid-point wording at " + f"breakdown_M == 0. Got: {summary!r}" + ) + + def test_breakdown_m_small_positive_still_uses_smallest_grid_point_wording(self): + """Breakdown values just above zero (e.g., 0.03) should also + route through the smallest-grid-point wording — quoting + ``0.03x`` to a stakeholder is equally uninformative. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.03) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "smallest parallel-trends violations" in summary + assert "0.03x" not in summary + + def test_breakdown_m_normal_keeps_multiplier_wording(self): + """Breakdown values at the usual fragile-but-nonzero range + (e.g., 0.3) must still quote the ``0.3x`` multiplier — the + smallest-grid-point wording is only for the degenerate tail. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.3) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "0.3x" in summary + assert "smallest parallel-trends violations" not in summary + + class TestBaconCaveatEstimatorAware: """Round-45 P1 CI review on PR #318: Goodman-Bacon decomposes TWFE weights. 
On fits already produced by a heterogeneity-robust From 1818503fe18fa5be9a61122dafab65cb910f90b8 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 19:45:19 -0400 Subject: [PATCH 2/3] Restructure canonical validation: replace one-shot script with regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: the validation script + findings doc were one-shot artifacts that would age poorly. Replace them with ``tests/test_br_dr_canonical_datasets.py`` — pytest regression guards that assert canonical properties (direction, PT verdict tier, HonestDiD breakdown_M tier, cross-estimator consistency) on each canonical fit. Uses the ``_construct_*`` fallback data from ``diff_diff.datasets`` so tests have no network dependency (same pattern ``test_datasets.py`` already uses). Tests cover: - Card-Krueger (1994): positive sign, CI includes zero, "consistent with no effect" prose. - mpdta (CS 2021 benchmark): negative ATT, breakdown_M > 1.0, no_detected_violation pre-trends. - Castle Doctrine (Cheng-Hoekstra 2013) under CS: positive ATT, clear_violation pre-trends, fragile sensitivity (breakdown_M < 0.5). - Castle Doctrine cross-estimator consistency: SA agrees with CS on direction and PT verdict bin. - Treatment-label capitalization bugs: ``NJ`` abbreviation and ``Castle Doctrine`` proper noun preserved through BR's sentence capitalization. - ``breakdown_M == 0`` edge case: BR summary uses smallest-grid-point wording, not the degenerate ``0x`` multiplier. Drops: - ``docs/validation/validate_br_dr_canonical.py`` — one-shot script, replaced by the regression tests. - ``docs/validation/br_dr_canonical_validation.md`` — raw dump, regenerable on demand if needed but not checked in. - ``docs/validation/br_dr_canonical_findings.md`` — summary now lives in the regression-test docstrings. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/validation/br_dr_canonical_findings.md | 175 ----- docs/validation/br_dr_canonical_validation.md | 723 ------------------ docs/validation/validate_br_dr_canonical.py | 308 -------- tests/test_br_dr_canonical_datasets.py | 356 +++++++++ 4 files changed, 356 insertions(+), 1206 deletions(-) delete mode 100644 docs/validation/br_dr_canonical_findings.md delete mode 100644 docs/validation/br_dr_canonical_validation.md delete mode 100644 docs/validation/validate_br_dr_canonical.py create mode 100644 tests/test_br_dr_canonical_datasets.py diff --git a/docs/validation/br_dr_canonical_findings.md b/docs/validation/br_dr_canonical_findings.md deleted file mode 100644 index 7f369573..00000000 --- a/docs/validation/br_dr_canonical_findings.md +++ /dev/null @@ -1,175 +0,0 @@ -# BR / DR canonical-dataset validation findings - -This file records divergences observed in -``br_dr_canonical_validation.md`` against canonical literature -interpretations. Generated by running -``docs/validation/validate_br_dr_canonical.py`` on the bundled -datasets (Card-Krueger 1994, Callaway-Sant'Anna mpdta benchmark, -Castle Doctrine / Cheng-Hoekstra 2013). This closes BR/DR -foundation gap #4 — real-dataset validation — from the -external-positioning gap list in -``project_br_dr_foundation.md``. - -The goal of the validation exercise is to stress-test BR's prose on -fits that published applied work has already interpreted, not to -exactly reproduce their point estimates (the bundled datasets are -either the R `did` package simulated benchmark or the causaldata -mirrors, which may differ from the original author data). - -## Headline assessment - -BR's prose direction, verdicts, and caveat framing match canonical -interpretations across all four runs: - -- **Card-Krueger**: positive sign, CI includes zero, "data consistent - with no effect." Matches the famous Card-Krueger finding of no - disemployment. 
-- **mpdta (CS)**: aggregate ATT negative (-0.021 log-points), pre-trends
-  `no_detected_violation`, HonestDiD `robust_to_M_1.28`. Matches CS
-  tutorial expectations that the fit is robust.
-- **Castle Doctrine (CS)**: positive sign (homicides went up), pre-trends
-  `clear_violation` (joint p = 0.003), HonestDiD `fragile`
-  (breakdown_M = 0). Matches Cheng-Hoekstra's escalation finding AND
-  correctly flags the identifying-assumption fragility the staggered
-  rollout produces.
-- **Castle Doctrine (SA)**: identical point estimates (as expected —
-  CS and SA are algebraically consistent on this data), same clear PT
-  violation verdict.
-
-No wrong-sign or wrong-verdict findings surfaced on any of the four
-runs. The Bacon "already-robust" framing lifted from round-45 reads
-correctly on the staggered fits (CS and SA on Castle Doctrine and
-mpdta): the caveat is scoped as a statement about the rollout
-design, not a switch-estimator recommendation.
-
-## Issues fixed in this PR
-
-Small prose bugs surfaced by the real-data output. Each is a wording
-fix, not a methodology defect. Both fixes are regression-tested under
-``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes``.
-
-### Issue 1 (FIXED): Treatment label first-word capitalization eats abbreviations
-
-Card-Krueger output:
-
-> The **nj** minimum-wage increase lifted FTE employment by 1.47 FTE …
-
-Castle Doctrine output (CS and SA):
-
-> **Castle doctrine** law adoption worsened Homicide rate (per 100k) …
-
-BR used ``str.capitalize()``, which lowercases every character after
-the first. For labels starting with an abbreviation (``"the NJ
-minimum-wage increase"``) or a proper-noun phrase (``"Castle Doctrine
-law adoption"``), this flattened case in a way that looked wrong in
-stakeholder-facing prose.
-
-**Fix**: replaced ``str.capitalize()`` with a new
-``_sentence_first_upper`` helper that uppercases only the first
-character and preserves user-supplied casing for everything else.
- -### Issue 2 (FIXED): ``breakdown_M = 0`` phrasing reads as "0x" (zero-times-something) - -Castle Doctrine (CS): - -> HonestDiD: the result is fragile — the confidence interval -> includes zero once violations reach **0x** the pre-period -> variation. - -When the breakdown M is exactly 0, "reach 0x the pre-period -variation" is a degenerate reading — the CI already includes zero -under any nonzero pre-trend violation (or even with zero violation, -depending on the grid). - -**Fix**: when ``breakdown_M <= 0.05``, both BR's summary and DR's -overall-interpretation sentence emit "the confidence interval -includes zero even at the smallest parallel-trends violations on -the sensitivity grid" instead of quoting the ``0x`` multiplier. The -0.05 threshold also covers near-zero values (e.g., 0.03) where the -multiplier is equally uninformative to stakeholders. - -### Issue 3 (deferred): Outcome-label capitalization in mid-sentence - -mpdta output: - -> … reduced **Log employment** by 0.0214 log-points … - -The user-supplied ``outcome_label="Log employment"`` is -capitalized as-is, which looks awkward mid-sentence. This is -stylistic, and arguably user-controllable (the user could pass -``outcome_label="log employment"``). Deprioritize unless fixing -Issue 1 is trivially extensible. Noted here for follow-up. - -## Issues to track as follow-up (out-of-scope for this PR) - -### Follow-up A: ``SunAbrahamResults`` excluded from HonestDiD applicability - -BR's applicability matrix (``diagnostic_report.py`` line ~107) lists -``SunAbrahamResults`` with ``{parallel_trends, pretrends_power, bacon, -design_effect, heterogeneity}`` — NO ``sensitivity``. But the -original plan's applicability matrix in -``project_br_dr_foundation.md`` and the SA methodology surface -(event-study coefficients + VCov) both support HonestDiD in principle. 
- -Observed on Castle Doctrine (SA): - -> ## Sensitivity (HonestDiD) -> -> - Sensitivity not computed: sensitivity is not applicable to -> SunAbrahamResults. - -Given SA shows the same PT violation that CS does on this dataset, -not having HonestDiD sensitivity on SA is a real usability gap. This -requires adding an SA adapter to ``compute_honest_did`` and expanding -the applicability matrix; it is library work beyond BR/DR prose. -Belongs in the BR/DR gap-list expansion. - -### Follow-up B: Target-parameter clarity (gap list item #6) - -The assumption block on every staggered fit still reads: - -> Identification relies on parallel trends across treatment cohorts -> and time periods (group-time ATT), plus no anticipation. - -But the CS ``overall_att`` for mpdta is a specific weighted average -of ``ATT(g, t)`` cells, SA is an IW average, Stacked is a -sub-experiment-weighted average, dCDH is a switchers average. BR's -headline reports a single number without disambiguating the -estimand. For Baker et al. (2025) practitioner-guide parity, the -assumption block should carry the target-parameter clause. -Already tracked as gap #6. - -### Follow-up C: Card-Krueger effect size differs from published ATT - -Our bundled ``load_card_krueger`` returns an ATT of +1.47 FTE; the -published Card-Krueger ATT is ~+0.59 FTE. The direction and -CI-includes-zero verdict match canonical, but the magnitude does -not. This is a ``datasets.py`` data-loading question (the -causaldata mirror may aggregate differently than the original -author sample), not a BR prose bug. Noted here so a future -data-validation PR can address it upstream. - -## What was validated (summary) - -- End-to-end BR / DR flow runs without errors on 4 canonical datasets. -- Direction of the effect matches canonical interpretation on all 4. -- Pre-trends verdict tier (no_detected_violation / clear_violation) - matches the literature's reading. -- HonestDiD sensitivity tier (robust vs fragile) matches. 
-- Bacon "already-robust" framing from round-45 reads correctly on - real staggered data. -- The identifying-assumption source-faithfulness retags from - round-42 (BJS / Gardner untreated-outcome FE model) did not - surface on these runs because none of the datasets was run - through ImputationDiD or TwoStageDiD — follow-up validation - should add those. - -## Regeneration - -```bash -python docs/validation/validate_br_dr_canonical.py -``` - -The script writes ``br_dr_canonical_validation.md`` (the raw output -artifact); this file is the findings synthesis and is written by -hand from that artifact. diff --git a/docs/validation/br_dr_canonical_validation.md b/docs/validation/br_dr_canonical_validation.md deleted file mode 100644 index 99bb5af0..00000000 --- a/docs/validation/br_dr_canonical_validation.md +++ /dev/null @@ -1,723 +0,0 @@ -# BR / DR canonical-dataset validation - -Output of ``docs/validation/validate_br_dr_canonical.py``. Each section runs BusinessReport (and its auto-constructed DiagnosticReport) on a canonical DiD dataset and dumps summary + full_report + selected to_dict blocks. The purpose is to compare BR's prose output against published canonical interpretations and record divergences in ``br_dr_canonical_findings.md``. - -This file is regenerable; do not hand-edit. - -Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA). - ---- - -## Card & Krueger (1994): NJ/PA minimum wage -Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA (control, $4.25 throughout). Outcome: full-time equivalent employment. N=310 stores. - -Canonical interpretation: no significant disemployment effect of the minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). The famous finding was that the CI included zero. - -### BusinessReport.summary() -``` -Question: Did the NJ minimum-wage increase reduce fast-food employment? 
The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). Statistically, the confidence interval includes zero; the data are consistent with no effect. Sample: 620 observations (462 treated, 158 control). -``` -### BusinessReport.full_report() -```markdown -# Business Report: FTE employment - -**Question**: Did the NJ minimum-wage increase reduce fast-food employment? - -**Estimator**: `DiDResults` - -## Headline - -The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). - -Statistically, the confidence interval includes zero; the data are consistent with no effect. - -## Identifying Assumption - -Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group. - -## Pre-Trends - -- Pre-trends not computed: auto_diagnostics=False - -## Sensitivity (HonestDiD) - -- Sensitivity not computed: auto_diagnostics=False - -## Sample - -- Observations: 620 -- Treated: 462 -- Control: 158 - -## References - -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. - - -## Technical Appendix - -``` -====================================================================== - Difference-in-Differences Estimation Results -====================================================================== - -Observations: 620 -Treated: 462 -Control: 158 -R-squared: 0.0036 -Variance: HC1 heteroskedasticity-robust - ----------------------------------------------------------------------- -Parameter Estimate Std. Err. 
t-stat P>|t| ----------------------------------------------------------------------- -ATT 1.4718 1.9320 0.762 0.4465 ----------------------------------------------------------------------- - -95% Confidence Interval: [-2.3224, 5.2660] -CV (SE/|ATT|): 1.3127 - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 -====================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": 1.4718176338428604, - "se": 1.9320362599811534, - "ci_lower": -2.322358689575049, - "ci_upper": 5.26599395726077, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 0.4464732839915416, - "is_significant": false, - "near_significance_threshold": false, - "unit": "FTE", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": null -} -``` -```json -{ - "parallel_trends_variant": "unconditional", - "no_anticipation": true, - "description": "Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group." -} -``` -```json -[] -``` - ---- -## Callaway-Sant'Anna benchmark (mpdta) -Data: simulated county-level panel from R `did` package (Callaway & Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: log employment (`lemp`). - -Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) on treated counties; group-specific ATT(g,t) negative across cohorts. See CS (2021) Figures 1-2. - -### BusinessReport.summary() -``` -Question: Did minimum-wage increases reduce county employment? The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients do not reject parallel trends; the test is moderately informative. 
See the sensitivity analysis below for bounded-violation guarantees. HonestDiD: the result remains significant under parallel-trends violations up to 1.3x the observed pre-period variation. Sample: 2,500 observations (309 treated, 191 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Log employment - -**Question**: Did minimum-wage increases reduce county employment? - -**Estimator**: `CallawaySantAnnaResults` - -## Headline - -The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `no_detected_violation` (joint p = 0.482) -- Power tier: `moderately_powered` -- Minimum detectable violation (MDV): 0.0105 -- MDV / |ATT|: 0.49 - -## Sensitivity (HonestDiD) - -- Method: `relative_magnitude` -- Breakdown M: 1.28 -- Conclusion: `robust_to_M_1.28` - -## Sample - -- Observations: 2,500 -- Treated: 309 -- Control: 191 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 4 -- Range: -0.0293 to -0.00305 -- CV: 0.668 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -- **INFO** — The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. -- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Compare with alternative estimators (SA, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. 
- - -## Technical Appendix - -``` -===================================================================================== - Callaway-Sant'Anna Staggered Difference-in-Differences Results -===================================================================================== - -Total observations: 2500 -Treated units: 309 -Never-treated units: 191 -Treatment cohorts: 3 -Time periods: 5 -Control group: never_treated -Base period: universal - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- -ATT -0.0214 0.0019 -11.397 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [-0.0251, -0.0178] -CV (SE/|ATT|): 0.0877 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --4 0.0023 0.0036 0.627 0.5309 --3 -0.0019 0.0023 -0.810 0.4179 --2 -0.0020 0.0022 -0.875 0.3818 --1 0.0000 nan nan nan -0 -0.0293 0.0019 -15.137 0.0000 *** -1 -0.0235 0.0023 -10.111 0.0000 *** -2 -0.0134 0.0031 -4.373 0.0000 *** -3 -0.0031 0.0035 -0.884 0.3767 -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": -0.021448663176265446, - "se": 0.0018820025192546833, - "ci_lower": -0.025137320332818274, - "ci_upper": -0.01776000601971262, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 4.341504320370796e-30, - "is_significant": true, - "near_significance_threshold": false, - "unit": "log_points", - "unit_kind": "log_points", - "sign": "negative", - "breakdown_M": 1.2776496410369873 -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` -```json -{ - "status": "computed", - "method": "joint_wald_event_study", - "joint_p_value": 0.4816505473216015, - "verdict": "no_detected_violation", - "n_pre_periods": 3, - "n_dropped_undefined": null, - "reason": null, - "df_denom": null, - "power_status": "ran", - "power_reason": null, - "power_tier": "moderately_powered", - "mdv": 0.010472079171705551, - "mdv_share_of_att": 0.48823924762330606, - "power_covariance_source": "diag_fallback_available_full_vcov_unused" -} -``` -```json -{ - "status": "computed", - "method": "relative_magnitude", - "breakdown_M": 1.2776496410369873, - "conclusion": "robust_to_M_1.28", - "grid": [ - { - "M": 0.5, - "ci_lower": -0.026608074507223883, - "ci_upper": -0.008013136465533054, - "bound_lower": -0.022462755203290868, - "bound_upper": -0.01215845576946607, - "robust_to_zero": true - }, - { - "M": 1.0, - "ci_lower": -0.03176022422413628, - "ci_upper": -0.0028609867486206544, - "bound_lower": -0.027614904920203267, - "bound_upper": -0.00700630605255367, - "robust_to_zero": true - }, - { - "M": 1.5, - "ci_lower": -0.03691237394104868, - "ci_upper": 0.002291162968291743, - 
"bound_lower": -0.03276705463711566, - "bound_upper": -0.0018541563356412726, - "robust_to_zero": false - }, - { - "M": 2.0, - "ci_lower": -0.04206452365796108, - "ci_upper": 0.007443312685204144, - "bound_lower": -0.03791920435402807, - "bound_upper": 0.0032979933812711283, - "robust_to_zero": false - } - ] -} -``` -```json -[ - { - "severity": "warning", - "topic": "bacon_contamination", - "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." - }, - { - "severity": "info", - "topic": "unit_policy", - "message": "The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes." - } -] -``` - ---- -## Cheng & Hoekstra (2013): Castle Doctrine laws -Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. Outcome: homicide rate per 100k population. - -Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in homicide rates in states that adopted Castle Doctrine (no deterrent effect; if anything, an escalation). - -### BusinessReport.summary() -``` -Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.00347); the headline should be treated as tentative pending the sensitivity analysis below. 
HonestDiD: the result is fragile — the confidence interval includes zero even at the smallest parallel-trends violations on the sensitivity grid. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Homicide rate (per 100k) - -**Question**: Did Castle Doctrine law adoption change state homicide rates? - -**Estimator**: `CallawaySantAnnaResults` - -## Headline - -Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `clear_violation` (joint p = 0.00347) -- Power tier: `underpowered` -- Minimum detectable violation (MDV): 0.732 -- MDV / |ATT|: 1.3 - -## Sensitivity (HonestDiD) - -- Method: `relative_magnitude` -- Breakdown M: 0 -- Conclusion: `fragile` - -## Sample - -- Observations: 539 -- Treated: 22 -- Control: 27 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 6 -- Range: 0.237 to 0.764 -- CV: 0.348 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -- **WARNING** — HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. -- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Compare with alternative estimators (SA, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. 
- - -## Technical Appendix - -``` -===================================================================================== - Callaway-Sant'Anna Staggered Difference-in-Differences Results -===================================================================================== - -Total observations: 539 -Treated units: 22 -Never-treated units: 27 -Treatment cohorts: 6 -Time periods: 11 -Control group: never_treated -Base period: universal - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- -ATT 0.5608 0.1216 4.613 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [0.3225, 0.7991] -CV (SE/|ATT|): 0.2168 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --10 -0.3415 0.1462 -2.336 0.0195 * --9 0.3406 0.5526 0.616 0.5377 --8 -0.1465 0.1794 -0.816 0.4143 --7 0.1393 0.3426 0.406 0.6844 --6 0.2611 0.1574 1.659 0.0972 . --5 -0.0466 0.1215 -0.383 0.7015 --4 0.1224 0.1511 0.810 0.4180 --3 0.0783 0.1505 0.520 0.6030 --2 0.1541 0.1085 1.420 0.1555 --1 0.0000 nan nan nan -0 0.4453 0.1606 2.772 0.0056 ** -1 0.7074 0.1957 3.614 0.0003 *** -2 0.7642 0.1590 4.807 0.0000 *** -3 0.5525 0.1582 3.492 0.0005 *** -4 0.2367 0.1789 1.323 0.1859 -5 0.6463 0.1227 5.269 0.0000 *** -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": 0.5608256172839505, - "se": 0.12157293428086259, - "ci_lower": 0.32254704459860495, - "ci_upper": 0.7991041899692961, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 3.967463629059167e-06, - "is_significant": true, - "near_significance_threshold": false, - "unit": "per 100k population", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": 0.0 -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` -```json -{ - "status": "computed", - "method": "joint_wald_event_study", - "joint_p_value": 0.003469090217576798, - "verdict": "clear_violation", - "n_pre_periods": 9, - "n_dropped_undefined": null, - "reason": null, - "df_denom": null, - "power_status": "ran", - "power_reason": null, - "power_tier": "underpowered", - "mdv": 0.7318611799799601, - "mdv_share_of_att": 1.3049710238350487, - "power_covariance_source": "diag_fallback_available_full_vcov_unused" -} -``` -```json -{ - "status": "computed", - "method": "relative_magnitude", - "breakdown_M": 0.0, - "conclusion": "fragile", - "grid": [ - { - "M": 0.5, - "ci_lower": -0.84457211437162, - "ci_upper": 1.9620045961559538, - "bound_lower": -0.6348485739226502, - "bound_upper": 1.752281055706984, - "robust_to_zero": false - }, - { - "M": 1.0, - "ci_lower": -2.038136929186437, - "ci_upper": 3.1555694109707706, - "bound_lower": -1.8284133887374672, - "bound_upper": 2.9458458705218007, - "robust_to_zero": false - }, - { - "M": 1.5, - "ci_lower": -3.231701744001254, - "ci_upper": 4.349134225785587, - "bound_lower": -3.021978203552284, - "bound_upper": 4.1394106853366175, - 
"robust_to_zero": false - }, - { - "M": 2.0, - "ci_lower": -4.4252665588160705, - "ci_upper": 5.542699040600405, - "bound_lower": -4.215543018367101, - "bound_upper": 5.332975500151435, - "robust_to_zero": false - } - ] -} -``` -```json -[ - { - "severity": "warning", - "topic": "bacon_contamination", - "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." - }, - { - "severity": "warning", - "topic": "sensitivity_fragility", - "message": "HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative." - } -] -``` - ---- -## Castle Doctrine under Sun-Abraham (2021) -Same dataset and research question; different estimator. Testing BR/DR cross-estimator narrative consistency. - -### BusinessReport.summary() -``` -Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.0128); the headline should be treated as tentative. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Homicide rate (per 100k) - -**Question**: Did Castle Doctrine law adoption change state homicide rates? - -**Estimator**: `SunAbrahamResults` - -## Headline - -Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `clear_violation` (joint p = 0.0128) -- Power tier: `moderately_powered` -- Minimum detectable violation (MDV): 0.551 -- MDV / |ATT|: 0.98 - -## Sensitivity (HonestDiD) - -- Sensitivity not computed: sensitivity is not applicable to SunAbrahamResults. - -## Sample - -- Observations: 539 -- Treated: 22 -- Control: 27 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 6 -- Range: 0.237 to 0.764 -- CV: 0.348 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. 
-- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Specification-based falsification - - _why_: Compare results across control group definitions (never_treated vs not_yet_treated) and anticipation settings to assess robustness. -- Compare with alternative estimators (CS, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. - - -## Technical Appendix - -``` -===================================================================================== - Sun-Abraham Interaction-Weighted Estimator Results -===================================================================================== - -Total observations: 539 -Treated units: 22 -Control units: 27 -Treatment cohorts: 6 -Time periods: 11 -Control group: never_treated - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. 
-------------------------------------------------------------------------------------- -ATT 0.5608 0.1208 4.642 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [0.3240, 0.7976] -CV (SE/|ATT|): 0.2154 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --10 -0.3415 0.1566 -2.181 0.0292 * --9 0.3406 0.1067 3.191 0.0014 ** --8 -0.1465 0.1379 -1.062 0.2882 --7 0.1393 0.3326 0.419 0.6754 --6 0.2611 0.1646 1.586 0.1128 --5 -0.0466 0.1181 -0.394 0.6933 --4 0.1224 0.1344 0.911 0.3625 --3 0.0783 0.1576 0.497 0.6194 --2 0.1541 0.0957 1.610 0.1075 -0 0.4453 0.1627 2.737 0.0062 ** -1 0.7074 0.1903 3.716 0.0002 *** -2 0.7642 0.1612 4.739 0.0000 *** -3 0.5525 0.1646 3.357 0.0008 *** -4 0.2367 0.1909 1.240 0.2150 -5 0.6463 0.1313 4.921 0.0000 *** -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption -```json -{ - "effect": 0.5608256172839505, - "se": 0.12081241968043965, - "ci_lower": 0.3240376258251508, - "ci_upper": 0.7976136087427503, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 3.448543002483855e-06, - "is_significant": true, - "near_significance_threshold": false, - "unit": "per 100k population", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": null -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` - ---- diff --git a/docs/validation/validate_br_dr_canonical.py b/docs/validation/validate_br_dr_canonical.py deleted file mode 100644 index cc411551..00000000 --- a/docs/validation/validate_br_dr_canonical.py +++ /dev/null @@ -1,308 +0,0 @@ -"""Run BusinessReport / DiagnosticReport on canonical DiD datasets. - -Writes ``docs/validation/br_dr_canonical_validation.md`` with the -full BR ``summary()`` + ``full_report()`` + selected ``to_dict()`` -blocks for each dataset. The markdown output is the reviewable -artifact; compare it against canonical literature interpretations -and record any divergences in -``docs/validation/br_dr_canonical_findings.md``. - -Purpose: BR/DR gap #4 (real-dataset validation) — synthetic-DGP -tests pass but we haven't checked whether the prose output matches -canonical interpretations of applied work. - -Run via: ``python docs/validation/validate_br_dr_canonical.py``. 
-""" - -from __future__ import annotations - -import json -import sys -import warnings -from pathlib import Path - -import numpy as np - -from diff_diff import ( - BusinessReport, - CallawaySantAnna, - DifferenceInDifferences, - SunAbraham, -) -from diff_diff.datasets import ( - load_card_krueger, - load_castle_doctrine, - load_mpdta, -) - -OUT_PATH = Path(__file__).parent / "br_dr_canonical_validation.md" - - -def _section(title: str, level: int = 2) -> str: - return "#" * level + " " + title + "\n" - - -def _fence(body: str, lang: str = "") -> str: - return f"```{lang}\n{body.rstrip()}\n```\n" - - -def _dump_block(name: str, block: dict) -> str: - return _fence(json.dumps(block, indent=2, default=str), "json") - - -def _card_krueger_section() -> str: - """Card & Krueger (1994) minimum wage — classic 2x2 DiD. - - Canonical finding: no significant negative effect of NJ minimum-wage - increase on fast-food employment; published ATT ~ +2.8 FTE or - approximately 0.6 FTE per store depending on specification. CI - includes zero; direction positive. - """ - parts = [_section("Card & Krueger (1994): NJ/PA minimum wage", 2)] - ck = load_card_krueger() - # Reshape wide -> long per the docstring example. 
- ck_long = ck.melt( - id_vars=["store_id", "state", "treated"], - value_vars=["emp_pre", "emp_post"], - var_name="period", - value_name="employment", - ) - ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) - did = DifferenceInDifferences() - fit = did.fit(ck_long, outcome="employment", treatment="treated", time="post") - br = BusinessReport( - fit, - outcome_label="FTE employment", - outcome_unit="FTE", - outcome_direction="higher_is_better", - business_question="Did the NJ minimum-wage increase reduce fast-food employment?", - treatment_label="the NJ minimum-wage increase", - auto_diagnostics=False, # 2x2 PT needs manual column kwargs; run without for now - ) - parts.append( - "Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA " - "(control, $4.25 throughout). Outcome: full-time equivalent employment. " - f"N={len(ck)} stores.\n\n" - ) - parts.append( - "Canonical interpretation: no significant disemployment effect of the " - "minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). " - "The famous finding was that the CI included zero.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _mpdta_section() -> str: - """Callaway-Sant'Anna benchmark (mpdta): county-level log employment - under staggered minimum-wage increases. - - Canonical finding: CS aggregate ATT roughly -0.04 to -0.05 on log - employment (i.e., ~4-5% employment decline for treated counties). - Group-level ATT(g,t) shown in CS Figure 1. 
- """ - parts = [_section("Callaway-Sant'Anna benchmark (mpdta)", 2)] - df = load_mpdta() - cs = CallawaySantAnna(base_period="universal") - fit = cs.fit( - df, - outcome="lemp", - unit="countyreal", - time="year", - first_treat="first_treat", - aggregate="event_study", - ) - br = BusinessReport( - fit, - outcome_label="Log employment", - outcome_unit="log_points", - outcome_direction="higher_is_better", - business_question="Did minimum-wage increases reduce county employment?", - treatment_label="the state-level minimum wage increase", - data=df, - outcome="lemp", - unit="countyreal", - time="year", - first_treat="first_treat", - ) - parts.append( - "Data: simulated county-level panel from R `did` package (Callaway & " - "Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: " - "log employment (`lemp`).\n\n" - ) - parts.append( - "Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) " - "on treated counties; group-specific ATT(g,t) negative across cohorts. " - "See CS (2021) Figures 1-2.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) - parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _castle_doctrine_section() -> str: - """Cheng & Hoekstra (2013): staggered adoption of Castle Doctrine laws. - - Canonical finding: ~8% increase in homicide rates in adopting - states; no deterrent effect on burglary or other crimes. 
- """ - parts = [_section("Cheng & Hoekstra (2013): Castle Doctrine laws", 2)] - df = load_castle_doctrine() - # CS with never-treated as control; outcome = homicide rate. - cs = CallawaySantAnna(base_period="universal", control_group="never_treated") - fit = cs.fit( - df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - aggregate="event_study", - ) - br = BusinessReport( - fit, - outcome_label="Homicide rate (per 100k)", - outcome_unit="per 100k population", - outcome_direction="lower_is_better", - business_question=( - "Did Castle Doctrine law adoption change state homicide rates?" - ), - treatment_label="Castle Doctrine law adoption", - data=df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - parts.append( - "Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. " - "Outcome: homicide rate per 100k population.\n\n" - ) - parts.append( - "Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in " - "homicide rates in states that adopted Castle Doctrine (no deterrent " - "effect; if anything, an escalation).\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) - parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _castle_doctrine_sun_abraham_section() -> str: - """Same Castle Doctrine dataset but run through Sun-Abraham, as a - cross-estimator consistency 
check. If SA and CS narrate the same - canonical finding differently, that's a BR/DR source-faithfulness - issue. - """ - parts = [_section("Castle Doctrine under Sun-Abraham (2021)", 2)] - df = load_castle_doctrine() - sa = SunAbraham() - fit = sa.fit( - df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - br = BusinessReport( - fit, - outcome_label="Homicide rate (per 100k)", - outcome_unit="per 100k population", - outcome_direction="lower_is_better", - business_question=( - "Did Castle Doctrine law adoption change state homicide rates?" - ), - treatment_label="Castle Doctrine law adoption", - data=df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - parts.append( - "Same dataset and research question; different estimator. Testing BR/DR " - "cross-estimator narrative consistency.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append("\n---\n") - return "".join(parts) - - -def main() -> int: - warnings.filterwarnings("ignore") - np.random.seed(42) - - header = ( - "# BR / DR canonical-dataset validation\n\n" - "Output of ``docs/validation/validate_br_dr_canonical.py``. Each section " - "runs BusinessReport (and its auto-constructed DiagnosticReport) on a " - "canonical DiD dataset and dumps summary + full_report + selected " - "to_dict blocks. 
The purpose is to compare BR's prose output against " - "published canonical interpretations and record divergences in " - "``br_dr_canonical_findings.md``.\n\n" - "This file is regenerable; do not hand-edit.\n\n" - "Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 " - "benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA).\n\n" - "---\n\n" - ) - - sections = [header] - for name, fn in ( - ("card_krueger", _card_krueger_section), - ("mpdta", _mpdta_section), - ("castle_doctrine_cs", _castle_doctrine_section), - ("castle_doctrine_sa", _castle_doctrine_sun_abraham_section), - ): - print(f"Running {name} ...", file=sys.stderr) - try: - sections.append(fn()) - except Exception as exc: # noqa: BLE001 - sections.append( - _section(f"{name} (ERROR)", 2) - + _fence(f"{type(exc).__name__}: {exc}") - + "\n---\n" - ) - print(f" {type(exc).__name__}: {exc}", file=sys.stderr) - - OUT_PATH.write_text("".join(sections)) - print(f"Wrote {OUT_PATH}", file=sys.stderr) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py new file mode 100644 index 00000000..3fe7381c --- /dev/null +++ b/tests/test_br_dr_canonical_datasets.py @@ -0,0 +1,356 @@ +"""Canonical-dataset regression guards for BusinessReport / DiagnosticReport. + +Closes BR/DR foundation gap #4 (real-dataset validation): the risk was +that BR/DR's prose could silently diverge from canonical interpretations +of applied work without synthetic-DGP tests catching it. These tests +run BR on four canonical fits and assert direction / verdict / tier +properties that should hold regardless of small data-aggregation +differences between the bundled dataset and the published author +sample. + +Assertions are property-level, not exact-match: +- Sign of the point estimate. +- Whether the CI includes zero. +- Pre-trends verdict bin (``no_detected_violation`` vs + ``clear_violation``). 
+- HonestDiD sensitivity tier (robust vs fragile, via ``breakdown_M``). +- Cross-estimator consistency (CS and SA produce the same direction + and verdict on the same data). + +These tests use the ``_construct_*`` fallback data from +``diff_diff.datasets`` to avoid network dependency in CI. The +construction targets match the published summary statistics, so +canonical-direction / canonical-verdict properties hold. +""" + +from __future__ import annotations + +import warnings + +import pytest + +from diff_diff import ( + BusinessReport, + CallawaySantAnna, + DifferenceInDifferences, + SunAbraham, +) +from diff_diff.datasets import ( + _construct_card_krueger_data, + _construct_castle_doctrine_data, + _construct_mpdta_data, +) + + +@pytest.fixture(scope="module") +def card_krueger_long(): + """Card-Krueger dataset reshaped wide -> long for DiD fitting.""" + warnings.filterwarnings("ignore") + ck = _construct_card_krueger_data() + ck_long = ck.melt( + id_vars=["store_id", "state", "treated"], + value_vars=["emp_pre", "emp_post"], + var_name="period", + value_name="employment", + ) + ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) + return ck_long + + +@pytest.fixture(scope="module") +def mpdta_panel(): + """Callaway-Sant'Anna benchmark (mpdta) as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_mpdta_data() + + +@pytest.fixture(scope="module") +def castle_panel(): + """Cheng-Hoekstra Castle Doctrine dataset as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_castle_doctrine_data() + + +class TestCardKruegerCanonicalDirection: + """Card & Krueger (1994): NJ minimum-wage increase vs PA control. + + Canonical finding: no significant disemployment effect; published + ATT is positive (~+0.59 FTE per store) but the CI includes zero. 
+ """ + + def test_no_significant_disemployment(self, card_krueger_long): + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + outcome_unit="FTE", + treatment_label="the NJ minimum-wage increase", + outcome_direction="higher_is_better", + auto_diagnostics=False, + ) + h = br.to_dict()["headline"] + # Canonical: positive sign (no disemployment, if anything a + # small positive lift). + assert h["sign"] == "positive", ( + f"Card-Krueger canonical finding is a positive ATT; got " + f"sign={h['sign']!r}, effect={h['effect']!r}" + ) + # Canonical: CI includes zero -> not statistically significant. + assert h["ci_lower"] < 0 < h["ci_upper"], ( + f"Card-Krueger canonical finding is CI includes zero; got " + f"[{h['ci_lower']}, {h['ci_upper']}]" + ) + assert h["is_significant"] is False + # BR prose must name this in stakeholder-readable language. + summary = br.summary().lower() + assert "consistent with no effect" in summary, ( + f"BR summary must report 'consistent with no effect' on " + f"Card-Krueger. Got: {summary!r}" + ) + + def test_treatment_label_abbreviation_preserved(self, card_krueger_long): + """The ``NJ`` abbreviation in the treatment label must survive + BR's sentence capitalization (regression for the + ``str.capitalize()`` bug surfaced by this dataset). + """ + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + assert "The NJ minimum-wage increase" in br.headline() + + +class TestMpdtaCanonicalDirection: + """Callaway-Sant'Anna benchmark (mpdta): staggered minimum-wage + increases, log employment outcome. 
+ + Canonical finding: aggregate ATT is negative; the published fit is + robust under HonestDiD sensitivity; pre-trends do not reject + parallel trends. + """ + + def test_negative_att_robust_sensitivity_clean_pretrends(self, mpdta_panel): + cs = CallawaySantAnna(base_period="universal").fit( + mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Log employment", + outcome_unit="log_points", + treatment_label="the state-level minimum wage increase", + outcome_direction="higher_is_better", + data=mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: ATT on log employment is negative. + assert ( + h["sign"] == "negative" + ), f"mpdta canonical finding is negative ATT; got sign={h['sign']!r}" + # Canonical robustness: HonestDiD breakdown M > 1 means the + # result survives violations at least as large as the observed + # pre-period variation. + bkd = h.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd > 1.0, ( + f"mpdta canonical finding is robust sensitivity " + f"(breakdown_M > 1.0); got breakdown_M={bkd!r}" + ) + # Canonical pre-trends: do not reject PT. + pt = d["pre_trends"] + assert pt.get("verdict") == "no_detected_violation", ( + f"mpdta canonical finding is clean pre-trends " + f"(no_detected_violation); got verdict={pt.get('verdict')!r}" + ) + + +class TestCastleDoctrineCanonicalDirection: + """Cheng & Hoekstra (2013): Castle Doctrine / Stand Your Ground + laws staggered across U.S. states. + + Canonical finding: ~8% INCREASE in homicide rates (no deterrent + effect; if anything, escalation). Pre-trends violation is a + well-known issue with this dataset; HonestDiD sensitivity + flags the headline as fragile. 
+ """ + + def test_cs_positive_att_clear_violation_fragile_sensitivity(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: homicides went UP (positive ATT). + assert h["sign"] == "positive", ( + f"Castle Doctrine canonical finding is positive ATT (homicide " + f"escalation); got sign={h['sign']!r}" + ) + # Canonical: clear PT violation on this dataset. + pt = d["pre_trends"] + assert pt.get("verdict") == "clear_violation", ( + f"Castle Doctrine canonical finding is clear PT violation; " + f"got verdict={pt.get('verdict')!r}" + ) + # Canonical: HonestDiD flags fragility given the PT violation. + sens = d["sensitivity"] + assert sens.get("status") == "computed" + bkd = sens.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd < 0.5, ( + f"Castle Doctrine canonical finding is fragile sensitivity " + f"(breakdown_M < 0.5); got breakdown_M={bkd!r}" + ) + + def test_treatment_label_proper_noun_preserved(self, castle_panel): + """ "Castle Doctrine" must survive BR's sentence capitalization + (regression for the ``str.capitalize()`` bug surfaced by this + dataset). 
+ """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + auto_diagnostics=False, + ) + assert "Castle Doctrine law adoption" in br.headline() + + def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): + """Castle Doctrine's fragile sensitivity surfaced a + ``breakdown_M == 0`` edge case in BR's summary wording. The + summary must not quote ``0x the pre-period variation``; it + must use the smallest-grid-point phrasing. + """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + summary = br.summary() + bkd = br.to_dict()["headline"].get("breakdown_M") + # Sanity: this dataset actually produces the edge case. + assert isinstance(bkd, (int, float)) and bkd <= 0.05, ( + f"This test assumes Castle Doctrine + CS produces " + f"breakdown_M <= 0.05; if not, the dataset or estimator " + f"changed. Got breakdown_M={bkd!r}" + ) + # Must not render the degenerate "0x the pre-period variation" + # wording. + assert "0x" not in summary, ( + f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. 
Got: {summary!r}" + ) + assert "smallest parallel-trends violations" in summary + + +class TestCastleDoctrineCrossEstimatorConsistency: + """Running the same Castle Doctrine dataset through CS and SA must + produce consistent direction + PT verdict. SA is a natural + cross-check on the CS finding. + """ + + def test_sa_agrees_with_cs_on_direction_and_pt(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + sa = SunAbraham().fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_cs = BusinessReport( + cs, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_sa = BusinessReport( + sa, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + # Direction must agree: both positive (homicides up). + assert br_cs.to_dict()["headline"]["sign"] == br_sa.to_dict()["headline"]["sign"] + assert br_cs.to_dict()["headline"]["sign"] == "positive" + # PT verdict must agree on the clear-violation bin (both + # estimators read the same underlying pre-period coefficients). 
+ assert ( + br_cs.to_dict()["pre_trends"]["verdict"] + == br_sa.to_dict()["pre_trends"]["verdict"] + == "clear_violation" + ) From 6e24014c3077052c39f35adcef2e4961f79b2ec2 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 20:39:03 -0400 Subject: [PATCH 3/3] Address PR #341 R1: gate HonestDiD "smallest grid point" wording on evaluated grid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R1 caught a semantic bug in the round-1 canonical-validation wording fix. ``breakdown_M`` is the smallest M at which the robust CI includes zero — an interpolated threshold between grid points — not a claim about any specific grid point. Keying the "smallest grid point fails" wording off ``breakdown_M <= 0.05`` was wrong: on a grid starting at M=0 where the smallest evaluated point is still robust (CI excludes zero), a small ``breakdown_M=0.03`` means fragility emerges BETWEEN grid points, not at M=0. Fix (both BR and DR): - Added a ``_smallest_failing_grid_m`` helper (paired helpers in ``business_report.py`` and ``diagnostic_report.py``, intentionally duplicated with cross-reference comments per the parity rule from ``feedback_cross_surface_parity_audit.md``). - Helper returns the smallest evaluated M on the grid if that point has ``robust_to_zero == False``, else ``None``. - Fragile-sensitivity wording now fires "smallest M evaluated on the sensitivity grid (M = X)" ONLY when the helper returns a value; otherwise falls through to the numeric multiplier ``{bkd:.2g}x``. - Castle Doctrine (Cheng-Hoekstra 2013) CS fit: grid starts at M=0.5, every point non-robust — new wording quotes "(M = 0.5)" instead of "0x the pre-period variation". - Reviewer's counterexample (grid ``[0, 0.25, ...]`` with bkd=0.03, smallest point robust): wording falls through to "0.03x the pre-period variation", not "smallest grid point". Tests: - Rewrote ``TestCanonicalValidationSurfaceFixes`` on the BR side to build sensitivity schemas with explicit grids. 
Added paired cases: (a) smallest M fails, assert "smallest M evaluated"; (b) smallest M robust, breakdown 0.03, assert multiplier wording is used. - Added ``TestDRFragilePhrasingIsGridAware`` on the DR side mirroring the same paired cases against ``_render_overall_interpretation``. - Updated the Castle Doctrine canonical-dataset regression test to assert ``"M = 0.5"`` appears (actual smallest evaluated grid point). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 53 +++++++++--- diff_diff/diagnostic_report.py | 53 +++++++++--- tests/test_br_dr_canonical_datasets.py | 21 +++-- tests/test_business_report.py | 106 +++++++++++++++++------ tests/test_diagnostic_report.py | 115 +++++++++++++++++++++++++ 5 files changed, 289 insertions(+), 59 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index bbbeadc7..36f19780 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1854,6 +1854,30 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]: + """If the smallest evaluated M on the HonestDiD sensitivity grid + already has the robust CI including zero, return that M. Returns + ``None`` when the grid is missing or when the smallest evaluated + point is still robust — in the latter case ``breakdown_M`` is an + interpolated threshold between grid points, not a statement about + the smallest grid point itself. + + Matches the twin helper in ``diagnostic_report.py``; keep the two + in sync for cross-surface parity. 
+ """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + def _sentence_first_upper(text: str) -> str: """Uppercase only the first character of ``text``, preserving all other casing. Unlike ``str.capitalize()``, which lowercases every @@ -2115,19 +2139,26 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) elif isinstance(bkd, (int, float)): - # Round-1 BR/DR canonical-validation (2026-04-19): - # ``breakdown_M`` at or near zero reads as "0x the - # pre-period variation" which is a degenerate sentence - # (zero-times-anything is zero). The correct wording when - # the CI includes zero at the smallest grid point is to - # say the result is fragile to essentially any nonzero - # violation, not to quote the ``0x`` multiplier. - if bkd <= 0.05: + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: + # ``breakdown_M`` is the smallest M at which the robust + # CI includes zero (interpolated between grid points) — + # not a claim about any specific grid point. Earlier fix + # keyed off ``bkd <= 0.05`` which incorrectly asserted + # "smallest grid point fails" even for grids that start + # at M=0 where the smallest evaluated point is still + # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The + # "smallest grid point" wording is only accurate when + # the smallest evaluated M on the grid itself fails + # (``robust_to_zero == False``); otherwise fall through + # to the numeric multiplier. 
+ smallest_failed_m = _smallest_failing_grid_m(sens) + if smallest_failed_m is not None: sentences.append( "HonestDiD: the result is fragile — the confidence " - "interval includes zero even at the smallest " - "parallel-trends violations on the sensitivity " - "grid." + "interval includes zero even at the smallest M " + f"evaluated on the sensitivity grid (M = " + f"{smallest_failed_m:.2g})." ) else: sentences.append( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 21ce917d..6ec87b2a 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2780,6 +2780,32 @@ def _collect_pre_period_coefs( return results_list, n_dropped_undefined +def _smallest_failing_grid_m_dr(sens: Dict[str, Any]) -> Optional[float]: + """Return the smallest evaluated M on the HonestDiD sensitivity + grid if it already has the robust CI including zero, else ``None``. + Matches ``business_report._smallest_failing_grid_m`` — both helpers + must stay in sync for cross-surface parity. See PR #341 R1 review. + + ``breakdown_M`` is an interpolated threshold between grid points, + so "the smallest grid point fails" is only a valid claim when the + smallest actually-evaluated M has ``robust_to_zero == False``. On + a grid that starts at M=0 where the smallest evaluated point is + still robust, the breakdown value is information about what + happens between grid points — not at the smallest grid point. + """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + def _pt_verdict(p: Optional[float]) -> str: """Map a pre-trends joint p-value to the three-bin verdict enum. 
@@ -3118,22 +3144,25 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"pre-period variation." ) else: - # Round-1 BR/DR canonical-validation (2026-04-19): the - # "fragile — CI includes zero once violations reach 0x - # the pre-period variation" wording is a degenerate - # sentence at the ``breakdown_M == 0`` edge case - # surfaced by the Cheng-Hoekstra (2013) Castle Doctrine - # dataset. Mirror BR's fix: when the breakdown value is - # at or near zero, say the CI includes zero at the - # smallest grid point rather than quoting a ``0x`` - # multiplier. + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: the "smallest + # grid point" wording is only semantically correct when + # the smallest M actually evaluated on the sensitivity + # grid has ``robust_to_zero == False``. ``breakdown_M`` + # is the interpolated threshold between grid points, so + # a small breakdown value on a grid starting at M=0 + # (where the smallest evaluated point is still robust) + # would previously have been narrated as "smallest grid + # point fails" — stronger than the evaluated grid + # supports. Mirror BR's fix: check the grid directly. if isinstance(bkd, (int, float)): - if bkd <= 0.05: + smallest_failed_m = _smallest_failing_grid_m_dr(sens) + if smallest_failed_m is not None: sentences.append( "HonestDiD sensitivity: the result is fragile — " "the confidence interval includes zero even at " - "the smallest parallel-trends violations on the " - "sensitivity grid." + "the smallest M evaluated on the sensitivity " + f"grid (M = {smallest_failed_m:.2g})." 
) else: sentences.append( diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py index 3fe7381c..10998a61 100644 --- a/tests/test_br_dr_canonical_datasets.py +++ b/tests/test_br_dr_canonical_datasets.py @@ -259,11 +259,14 @@ def test_treatment_label_proper_noun_preserved(self, castle_panel): ) assert "Castle Doctrine law adoption" in br.headline() - def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): + def test_breakdown_m_zero_uses_smallest_m_evaluated_wording(self, castle_panel): """Castle Doctrine's fragile sensitivity surfaced a - ``breakdown_M == 0`` edge case in BR's summary wording. The - summary must not quote ``0x the pre-period variation``; it - must use the smallest-grid-point phrasing. + ``breakdown_M == 0`` edge case. The default HonestDiD grid + starts at M=0.5, and every grid point has the robust CI + including zero — so the smallest-M-evaluated wording is + semantically accurate here. BR's summary must say ``smallest + M evaluated on the sensitivity grid (M = 0.5)`` and must not + quote the degenerate ``0x`` multiplier. """ cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( castle_panel, @@ -292,12 +295,14 @@ def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): f"breakdown_M <= 0.05; if not, the dataset or estimator " f"changed. Got breakdown_M={bkd!r}" ) - # Must not render the degenerate "0x the pre-period variation" - # wording. - assert "0x" not in summary, ( + # Must not render the degenerate ``0x`` multiplier. + assert "0x the pre-period variation" not in summary, ( f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. Got: {summary!r}" ) - assert "smallest parallel-trends violations" in summary + # Must name the smallest evaluated grid point (0.5 for the + # default grid). 
+ assert "smallest M evaluated on the sensitivity grid" in summary + assert "M = 0.5" in summary class TestCastleDoctrineCrossEstimatorConsistency: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 77e059e3..8ec51c2b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -4004,11 +4004,28 @@ class CallawaySantAnnaResults: stub.inference_method = "analytical" return stub - def _fragile_dr_schema(self, breakdown_m: float): + def _fragile_dr_schema(self, breakdown_m: float, grid=None): """Build a fake DiagnosticReportResults whose ``sensitivity`` - block carries the given ``breakdown_M`` value.""" + block carries the given ``breakdown_M`` value and grid. Pass + ``grid`` as a list of ``{"M": float, "robust_to_zero": bool}`` + dicts (other fields populated with plausible values). + """ from diff_diff.diagnostic_report import DiagnosticReportResults + grid = grid if grid is not None else [] + # Populate optional CI / bound fields so grid entries match + # the schema the BR/DR runners actually emit. + grid_full = [ + { + "M": row["M"], + "ci_lower": row.get("ci_lower", 0.0), + "ci_upper": row.get("ci_upper", 0.0), + "bound_lower": row.get("bound_lower", 0.0), + "bound_upper": row.get("bound_upper", 0.0), + "robust_to_zero": row["robust_to_zero"], + } + for row in grid + ] schema = { "schema_version": "1.0", "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, @@ -4020,7 +4037,7 @@ def _fragile_dr_schema(self, breakdown_m: float): "method": "relative_magnitude", "breakdown_M": breakdown_m, "conclusion": "fragile", - "grid": [], + "grid": grid_full, }, "placebo": {"status": "skipped", "reason": "stub"}, "bacon": {"status": "skipped", "reason": "stub"}, @@ -4097,50 +4114,83 @@ def test_treatment_label_preserves_proper_noun_case(self): "Castle Doctrine law adoption" in headline ), f"Proper-noun casing must be preserved. 
Got: {headline!r}" - def test_breakdown_m_zero_uses_smallest_grid_point_wording(self): - """Cheng-Hoekstra Castle Doctrine produces ``breakdown_M == 0`` - under HonestDiD. The old wording "violations reach 0x the - pre-period variation" reads as a degenerate zero-times-variation - sentence. The fix switches to "includes zero even at the - smallest parallel-trends violations on the sensitivity grid" - for breakdown values at or near zero. + def test_smallest_grid_m_fails_uses_smallest_grid_point_wording(self): + """When the smallest M actually evaluated on the grid has + ``robust_to_zero == False``, the "smallest M evaluated" wording + is semantically correct. This is the Cheng-Hoekstra Castle + Doctrine pattern: default grid ``[0.5, 1.0, 1.5, 2.0]`` with + M=0.5 already non-robust. """ stub = self._cs_like_stub_with_zero_breakdown() - dr = self._fragile_dr_schema(breakdown_m=0.0) + dr = self._fragile_dr_schema( + breakdown_m=0.0, + grid=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + {"M": 1.5, "robust_to_zero": False}, + {"M": 2.0, "robust_to_zero": False}, + ], + ) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() - assert "0x" not in summary, ( - f"Summary must not render ``0x the pre-period variation``; " - f"that reads as zero-times-anything. Got: {summary!r}" - ) - assert "smallest parallel-trends violations" in summary, ( - f"Summary must use the smallest-grid-point wording at " - f"breakdown_M == 0. Got: {summary!r}" + # Must not render the degenerate multiplier form on the + # zero-breakdown case. + assert ( + "0x the pre-period variation" not in summary + ), f"Summary must not quote ``0x`` multiplier. Got: {summary!r}" + # New wording quotes the actual smallest evaluated M. + assert "smallest M evaluated on the sensitivity grid" in summary, ( + f"Summary must use the smallest-M-evaluated wording when the " + f"smallest grid point actually fails. 
Got: {summary!r}" ) - - def test_breakdown_m_small_positive_still_uses_smallest_grid_point_wording(self): - """Breakdown values just above zero (e.g., 0.03) should also - route through the smallest-grid-point wording — quoting - ``0.03x`` to a stakeholder is equally uninformative. + assert "M = 0.5" in summary + + def test_smallest_grid_m_robust_falls_through_to_multiplier_wording(self): + """CI review on PR #341 R1: ``breakdown_M`` is the interpolated + threshold between grid points, not a claim about any specific + grid point. On a grid starting at M=0 where the smallest + evaluated point is still robust, a small ``breakdown_M=0.03`` + does NOT mean the smallest grid point failed — it means + fragility emerges between grid points. The correct wording is + the numeric multiplier, not the smallest-grid-point claim. """ stub = self._cs_like_stub_with_zero_breakdown() - dr = self._fragile_dr_schema(breakdown_m=0.03) + dr = self._fragile_dr_schema( + breakdown_m=0.03, + grid=[ + # Smallest evaluated M (0) is still robust: CI excludes + # zero. Breakdown is interpolated somewhere between M=0 + # and M=0.25. + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + {"M": 0.5, "robust_to_zero": False}, + ], + ) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() - assert "smallest parallel-trends violations" in summary - assert "0.03x" not in summary + # Must NOT claim the smallest grid point failed — it didn't. + assert "smallest M evaluated on the sensitivity grid" not in summary, ( + f"Summary must not assert ``smallest M evaluated fails`` when the " + f"smallest grid point is still robust. Got: {summary!r}" + ) + # Correct wording quotes the numeric multiplier. + assert "0.03x" in summary, ( + f"Fragile fit with robust smallest-M should quote the interpolated " + f"breakdown multiplier. 
Got: {summary!r}" + ) def test_breakdown_m_normal_keeps_multiplier_wording(self): """Breakdown values at the usual fragile-but-nonzero range (e.g., 0.3) must still quote the ``0.3x`` multiplier — the - smallest-grid-point wording is only for the degenerate tail. + smallest-M-evaluated wording is only for grids whose smallest + actually-evaluated point is already non-robust. """ stub = self._cs_like_stub_with_zero_breakdown() dr = self._fragile_dr_schema(breakdown_m=0.3) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() assert "0.3x" in summary - assert "smallest parallel-trends violations" not in summary + assert "smallest M evaluated on the sensitivity grid" not in summary class TestBaconCaveatEstimatorAware: diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2f3800a6..ba473a56 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1891,6 +1891,121 @@ def test_full_report_has_headers(self, cs_fit): assert "## HonestDiD sensitivity" in md +class TestDRFragilePhrasingIsGridAware: + """CI review on PR #341 R1: DR's ``overall_interpretation`` + fragile-sensitivity sentence must be gated on the actual + evaluated grid, not just on ``breakdown_M``. ``breakdown_M`` is + the interpolated threshold between grid points; "smallest grid + point fails" is only a valid claim when the smallest actually- + evaluated M has ``robust_to_zero == False``. Mirrors the BR test + class ``TestCanonicalValidationSurfaceFixes``. 
+ """ + + @staticmethod + def _grid(breakdown_m, grid_rows): + """Build a sensitivity block with a populated grid, matching + the schema ``_check_sensitivity`` emits.""" + return { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": [ + { + "M": row["M"], + "ci_lower": 0.0, + "ci_upper": 0.0, + "bound_lower": 0.0, + "bound_upper": 0.0, + "robust_to_zero": row["robust_to_zero"], + } + for row in grid_rows + ], + } + + def _render(self, sens_block): + """Call the DR overall-interpretation renderer on a minimal + schema that has our sensitivity block and otherwise skipped + sections (so the fragile-sensitivity branch fires alone).""" + from diff_diff.diagnostic_report import _render_overall_interpretation + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": { + "status": "ran", + "effect": 0.5, + "se": 0.1, + "p_value": 0.0, + "ci_lower": 0.3, + "ci_upper": 0.7, + "is_significant": True, + "sign": "positive", + "alpha": 0.05, + }, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": sens_block, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "next_steps": [], + } + return _render_overall_interpretation(schema, {}) + + def test_dr_smallest_grid_m_fails_uses_smallest_m_wording(self): + """Castle Doctrine pattern: grid ``[0.5, 1.0, ...]`` with M=0.5 + already non-robust. DR emits "smallest M evaluated (M = 0.5)". 
+ """ + sens = self._grid( + breakdown_m=0.0, + grid_rows=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" in prose + assert "M = 0.5" in prose + assert "0x the pre-period variation" not in prose + + def test_dr_smallest_grid_m_robust_falls_through_to_multiplier(self): + """Grid starting at M=0 with smallest point still robust. + ``breakdown_M=0.03`` is the interpolated threshold between + M=0 and M=0.25; DR must NOT claim the smallest grid point + failed (it didn't) and must use the multiplier wording + instead. + """ + sens = self._grid( + breakdown_m=0.03, + grid_rows=[ + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" not in prose + assert "0.03x" in prose + + def test_dr_normal_fragile_keeps_multiplier(self): + """Normal fragile value (e.g., 0.3) still quotes the multiplier.""" + sens = self._grid( + breakdown_m=0.3, + grid_rows=[ + {"M": 0.5, "robust_to_zero": True}, + {"M": 1.0, "robust_to_zero": True}, + ], + ) + prose = self._render(sens) + assert "0.3x" in prose + assert "smallest M evaluated on the sensitivity grid" not in prose + + # --------------------------------------------------------------------------- # Public result class # ---------------------------------------------------------------------------