From 73ef44cd927ad947f8d81cfe5cb6ce06fb6cfe88 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 19:06:15 -0400 Subject: [PATCH 1/3] Add BR/DR canonical-dataset validation + two wording fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes BR/DR foundation gap #4 (real-dataset validation) from the external-positioning gap list in ``project_br_dr_foundation.md``. Validation artifact: - ``docs/validation/validate_br_dr_canonical.py`` runs BusinessReport / DiagnosticReport on Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), and Castle Doctrine (Cheng-Hoekstra 2013 under both CS and SA), dumping summary + full_report + selected to_dict blocks for each. - ``docs/validation/br_dr_canonical_validation.md`` is the regenerable raw output. - ``docs/validation/br_dr_canonical_findings.md`` is the hand-written synthesis: direction / verdict / sensitivity tier all match canonical interpretations, with two small wording bugs surfaced and fixed in this PR and two larger gaps queued as follow-up (SA HonestDiD applicability, target-parameter disambiguation). Wording fixes: 1. Treatment-label capitalization. ``str.capitalize()`` lowercased every character after the first, flattening embedded abbreviations (``"the NJ minimum-wage increase"`` → ``"The nj minimum-wage increase"``) and proper-noun phrases (``"Castle Doctrine law adoption"`` → ``"Castle doctrine law adoption"``). Replaced with a ``_sentence_first_upper`` helper that preserves user-supplied casing. 2. ``breakdown_M == 0`` phrasing. The HonestDiD fragile sentence quoted ``{breakdown_M:.2g}x the pre-period variation``, which renders as a degenerate ``0x`` on the exact-zero case surfaced by Cheng-Hoekstra. At ``breakdown_M <= 0.05`` (covers 0 and near-zero values), both BR's summary and DR's overall_interpretation now say "includes zero even at the smallest parallel-trends violations on the sensitivity grid" instead. 
Tests: 5 new regressions in ``TestCanonicalValidationSurfaceFixes`` covering both fixes + three boundary cases (exact zero, small positive, normal fragile value). Not in scope: Favara-Imbs (dCDH reversible-treatment dataset not bundled), ImputationDiD / TwoStageDiD on canonical data (needed to exercise the R42 untreated-outcome FE assumption branch on real data), SA HonestDiD applicability gap. All tracked in the findings doc for follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 54 +- diff_diff/diagnostic_report.py | 31 +- docs/validation/br_dr_canonical_findings.md | 175 +++++ docs/validation/br_dr_canonical_validation.md | 723 ++++++++++++++++++ docs/validation/validate_br_dr_canonical.py | 308 ++++++++ tests/test_business_report.py | 168 ++++ 6 files changed, 1446 insertions(+), 13 deletions(-) create mode 100644 docs/validation/br_dr_canonical_findings.md create mode 100644 docs/validation/br_dr_canonical_validation.md create mode 100644 docs/validation/validate_br_dr_canonical.py diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 2445251f..bbbeadc7 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1854,6 +1854,24 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _sentence_first_upper(text: str) -> str: + """Uppercase only the first character of ``text``, preserving all + other casing. Unlike ``str.capitalize()``, which lowercases every + character after the first, this keeps user-supplied abbreviations + and proper nouns intact. 
+ + Examples + -------- + >>> _sentence_first_upper("the NJ minimum-wage increase") + 'The NJ minimum-wage increase' + >>> _sentence_first_upper("Castle Doctrine law adoption") + 'Castle Doctrine law adoption' + """ + if not text: + return text + return text[0].upper() + text[1:] + + def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str: """Return a direction-aware verb for the headline sentence. @@ -1929,7 +1947,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: # is not actually available. ci_str = " (inference unavailable: confidence interval is undefined for this fit)" by_clause = f" by {magnitude}" if effect != 0 else "" - return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}." + # Round-1 BR/DR canonical-validation (2026-04-19): Python's + # ``str.capitalize()`` lowercases everything except the first + # character, so ``"the NJ minimum-wage increase".capitalize()`` + # returns ``"The nj minimum-wage increase"`` — flattening the + # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger, + # Castle Doctrine) carry proper-noun / acronym tokens in the + # user-supplied ``treatment_label``, so preserve user casing and + # only ensure the first character is uppercase. + treatment_sentence = _sentence_first_upper(treatment) + return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}." def _render_summary(schema: Dict[str, Any]) -> str: @@ -2088,11 +2115,26 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) elif isinstance(bkd, (int, float)): - sentences.append( - f"HonestDiD: the result is fragile — the confidence interval " - f"includes zero once violations reach {bkd:.2g}x the " - f"pre-period variation." - ) + # Round-1 BR/DR canonical-validation (2026-04-19): + # ``breakdown_M`` at or near zero reads as "0x the + # pre-period variation" which is a degenerate sentence + # (zero-times-anything is zero). 
The correct wording when + # the CI includes zero at the smallest grid point is to + # say the result is fragile to essentially any nonzero + # violation, not to quote the ``0x`` multiplier. + if bkd <= 0.05: + sentences.append( + "HonestDiD: the result is fragile — the confidence " + "interval includes zero even at the smallest " + "parallel-trends violations on the sensitivity " + "grid." + ) + else: + sentences.append( + f"HonestDiD: the result is fragile — the confidence " + f"interval includes zero once violations reach {bkd:.2g}x " + f"the pre-period variation." + ) # Sample sentence. For fits with a dynamic comparison set (CS / # ContinuousDiD / StaggeredTripleDiff / EfficientDiD / diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 0fe798a9..21ce917d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -3118,13 +3118,30 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"pre-period variation." ) else: - sentences.append( - f"HonestDiD sensitivity: the result is fragile — the " - f"confidence interval includes zero once violations reach " - f"{bkd:.2g}x the pre-period variation." - if isinstance(bkd, (int, float)) - else "" - ) + # Round-1 BR/DR canonical-validation (2026-04-19): the + # "fragile — CI includes zero once violations reach 0x + # the pre-period variation" wording is a degenerate + # sentence at the ``breakdown_M == 0`` edge case + # surfaced by the Cheng-Hoekstra (2013) Castle Doctrine + # dataset. Mirror BR's fix: when the breakdown value is + # at or near zero, say the CI includes zero at the + # smallest grid point rather than quoting a ``0x`` + # multiplier. + if isinstance(bkd, (int, float)): + if bkd <= 0.05: + sentences.append( + "HonestDiD sensitivity: the result is fragile — " + "the confidence interval includes zero even at " + "the smallest parallel-trends violations on the " + "sensitivity grid." 
+ ) + else: + sentences.append( + f"HonestDiD sensitivity: the result is fragile — " + f"the confidence interval includes zero once " + f"violations reach {bkd:.2g}x the pre-period " + f"variation." + ) # Sentence 4: one secondary caveat if present. bacon = schema.get("bacon") or {} diff --git a/docs/validation/br_dr_canonical_findings.md b/docs/validation/br_dr_canonical_findings.md new file mode 100644 index 00000000..7f369573 --- /dev/null +++ b/docs/validation/br_dr_canonical_findings.md @@ -0,0 +1,175 @@ +# BR / DR canonical-dataset validation findings + +This file records divergences observed in +``br_dr_canonical_validation.md`` against canonical literature +interpretations. Generated by running +``docs/validation/validate_br_dr_canonical.py`` on the bundled +datasets (Card-Krueger 1994, Callaway-Sant'Anna mpdta benchmark, +Castle Doctrine / Cheng-Hoekstra 2013). This closes BR/DR +foundation gap #4 — real-dataset validation — from the +external-positioning gap list in +``project_br_dr_foundation.md``. + +The goal of the validation exercise is to stress-test BR's prose on +fits that published applied work has already interpreted, not to +exactly reproduce their point estimates (the bundled datasets are +either the R `did` package simulated benchmark or the causaldata +mirrors, which may differ from the original author data). + +## Headline assessment + +BR's prose direction, verdicts, and caveat framing match canonical +interpretations across all four runs: + +- **Card-Krueger**: positive sign, CI includes zero, "data consistent + with no effect." Matches the famous Card-Krueger finding of no + disemployment. +- **mpdta (CS)**: aggregate ATT negative (-0.021 log-points), pre-trends + `no_detected_violation`, HonestDiD `robust_to_M_1.28`. Matches CS + tutorial expectations that the fit is robust. +- **Castle Doctrine (CS)**: positive sign (homicides went up), pre-trends + `clear_violation` (joint p = 0.003), HonestDiD `fragile` + (breakdown_M = 0). 
Matches Cheng-Hoekstra's escalation finding AND + correctly flags the identifying-assumption fragility the staggered + rollout produces. +- **Castle Doctrine (SA)**: identical point estimates (as expected — + CS and SA are algebraically consistent on this data), same clear PT + violation verdict. + +No wrong-sign or wrong-verdict findings surfaced on any of the four +runs. The Bacon "already-robust" framing lifted from round-45 reads +correctly on the staggered fits (CS and SA on Castle Doctrine and +mpdta): the caveat is scoped as a statement about the rollout +design, not a switch-estimator recommendation. + +## Issues fixed in this PR + +Small prose bugs surfaced by the real-data output. Each is a wording +fix, not a methodology defect. Both are regression-tested under +``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes``. + +### Issue 1 (FIXED): Treatment label first-word capitalization eats abbreviations + +Card-Krueger output: + +> The **nj** minimum-wage increase lifted FTE employment by 1.47 FTE … + +Castle Doctrine output (CS and SA): + +> **Castle doctrine** law adoption worsened Homicide rate (per 100k) … + +BR used ``str.capitalize()``, which lowercases every character after +the first. For labels starting with an abbreviation (``"the NJ +minimum-wage increase"``) or a proper-noun phrase (``"Castle Doctrine +law adoption"``), this flattened case in a way that looked wrong in +stakeholder-facing prose. + +**Fix**: replaced ``str.capitalize()`` with a new +``_sentence_first_upper`` helper that uppercases only the first +character and preserves user-supplied casing for everything else. + +### Issue 2 (FIXED): ``breakdown_M = 0`` phrasing reads as "0x" (zero-times-something) + +Castle Doctrine (CS): + +> HonestDiD: the result is fragile — the confidence interval +> includes zero once violations reach **0x** the pre-period +> variation.
+ +When the breakdown M is exactly 0, "reach 0x the pre-period +variation" is a degenerate reading — the CI already includes zero +under any nonzero pre-trend violation (or even with zero violation, +depending on the grid). + +**Fix**: when ``breakdown_M <= 0.05``, both BR's summary and DR's +overall-interpretation sentence emit "the confidence interval +includes zero even at the smallest parallel-trends violations on +the sensitivity grid" instead of quoting the ``0x`` multiplier. The +0.05 threshold also covers near-zero values (e.g., 0.03) where the +multiplier is equally uninformative to stakeholders. + +### Issue 3 (deferred): Outcome-label capitalization in mid-sentence + +mpdta output: + +> … reduced **Log employment** by 0.0214 log-points … + +The user-supplied ``outcome_label="Log employment"`` is +capitalized as-is, which looks awkward mid-sentence. This is +stylistic, and arguably user-controllable (the user could pass +``outcome_label="log employment"``). Deprioritize unless fixing +Issue 1 is trivially extensible. Noted here for follow-up. + +## Issues to track as follow-up (out-of-scope for this PR) + +### Follow-up A: ``SunAbrahamResults`` excluded from HonestDiD applicability + +BR's applicability matrix (``diagnostic_report.py`` line ~107) lists +``SunAbrahamResults`` with ``{parallel_trends, pretrends_power, bacon, +design_effect, heterogeneity}`` — NO ``sensitivity``. But the +original plan's applicability matrix in +``project_br_dr_foundation.md`` and the SA methodology surface +(event-study coefficients + VCov) both support HonestDiD in principle. + +Observed on Castle Doctrine (SA): + +> ## Sensitivity (HonestDiD) +> +> - Sensitivity not computed: sensitivity is not applicable to +> SunAbrahamResults. + +Given SA shows the same PT violation that CS does on this dataset, +not having HonestDiD sensitivity on SA is a real usability gap. 
This +requires adding an SA adapter to ``compute_honest_did`` and expanding +the applicability matrix; it is library work beyond BR/DR prose. +Belongs in the BR/DR gap-list expansion. + +### Follow-up B: Target-parameter clarity (gap list item #6) + +The assumption block on every staggered fit still reads: + +> Identification relies on parallel trends across treatment cohorts +> and time periods (group-time ATT), plus no anticipation. + +But the CS ``overall_att`` for mpdta is a specific weighted average +of ``ATT(g, t)`` cells, SA is an IW average, Stacked is a +sub-experiment-weighted average, dCDH is a switchers average. BR's +headline reports a single number without disambiguating the +estimand. For Baker et al. (2025) practitioner-guide parity, the +assumption block should carry the target-parameter clause. +Already tracked as gap #6. + +### Follow-up C: Card-Krueger effect size differs from published ATT + +Our bundled ``load_card_krueger`` returns an ATT of +1.47 FTE; the +published Card-Krueger ATT is ~+0.59 FTE. The direction and +CI-includes-zero verdict match canonical, but the magnitude does +not. This is a ``datasets.py`` data-loading question (the +causaldata mirror may aggregate differently than the original +author sample), not a BR prose bug. Noted here so a future +data-validation PR can address it upstream. + +## What was validated (summary) + +- End-to-end BR / DR flow runs without errors on 4 canonical datasets. +- Direction of the effect matches canonical interpretation on all 4. +- Pre-trends verdict tier (no_detected_violation / clear_violation) + matches the literature's reading. +- HonestDiD sensitivity tier (robust vs fragile) matches. +- Bacon "already-robust" framing from round-45 reads correctly on + real staggered data. 
+- The identifying-assumption source-faithfulness retags from + round-42 (BJS / Gardner untreated-outcome FE model) did not + surface on these runs because none of the datasets was run + through ImputationDiD or TwoStageDiD — follow-up validation + should add those. + +## Regeneration + +```bash +python docs/validation/validate_br_dr_canonical.py +``` + +The script writes ``br_dr_canonical_validation.md`` (the raw output +artifact); this file is the findings synthesis and is written by +hand from that artifact. diff --git a/docs/validation/br_dr_canonical_validation.md b/docs/validation/br_dr_canonical_validation.md new file mode 100644 index 00000000..99bb5af0 --- /dev/null +++ b/docs/validation/br_dr_canonical_validation.md @@ -0,0 +1,723 @@ +# BR / DR canonical-dataset validation + +Output of ``docs/validation/validate_br_dr_canonical.py``. Each section runs BusinessReport (and its auto-constructed DiagnosticReport) on a canonical DiD dataset and dumps summary + full_report + selected to_dict blocks. The purpose is to compare BR's prose output against published canonical interpretations and record divergences in ``br_dr_canonical_findings.md``. + +This file is regenerable; do not hand-edit. + +Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA). + +--- + +## Card & Krueger (1994): NJ/PA minimum wage +Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA (control, $4.25 throughout). Outcome: full-time equivalent employment. N=310 stores. + +Canonical interpretation: no significant disemployment effect of the minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). The famous finding was that the CI included zero. + +### BusinessReport.summary() +``` +Question: Did the NJ minimum-wage increase reduce fast-food employment? The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). 
Statistically, the confidence interval includes zero; the data are consistent with no effect. Sample: 620 observations (462 treated, 158 control). +``` +### BusinessReport.full_report() +```markdown +# Business Report: FTE employment + +**Question**: Did the NJ minimum-wage increase reduce fast-food employment? + +**Estimator**: `DiDResults` + +## Headline + +The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). + +Statistically, the confidence interval includes zero; the data are consistent with no effect. + +## Identifying Assumption + +Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group. + +## Pre-Trends + +- Pre-trends not computed: auto_diagnostics=False + +## Sensitivity (HonestDiD) + +- Sensitivity not computed: auto_diagnostics=False + +## Sample + +- Observations: 620 +- Treated: 462 +- Control: 158 + +## References + +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +====================================================================== + Difference-in-Differences Estimation Results +====================================================================== + +Observations: 620 +Treated: 462 +Control: 158 +R-squared: 0.0036 +Variance: HC1 heteroskedasticity-robust + +---------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| +---------------------------------------------------------------------- +ATT 1.4718 1.9320 0.762 0.4465 +---------------------------------------------------------------------- + +95% Confidence Interval: [-2.3224, 5.2660] +CV (SE/|ATT|): 1.3127 + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 +====================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": 1.4718176338428604, + "se": 1.9320362599811534, + "ci_lower": -2.322358689575049, + "ci_upper": 5.26599395726077, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 0.4464732839915416, + "is_significant": false, + "near_significance_threshold": false, + "unit": "FTE", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": null +} +``` +```json +{ + "parallel_trends_variant": "unconditional", + "no_anticipation": true, + "description": "Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group." +} +``` +```json +[] +``` + +--- +## Callaway-Sant'Anna benchmark (mpdta) +Data: simulated county-level panel from R `did` package (Callaway & Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: log employment (`lemp`). + +Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) on treated counties; group-specific ATT(g,t) negative across cohorts. See CS (2021) Figures 1-2. + +### BusinessReport.summary() +``` +Question: Did minimum-wage increases reduce county employment? The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients do not reject parallel trends; the test is moderately informative. See the sensitivity analysis below for bounded-violation guarantees. HonestDiD: the result remains significant under parallel-trends violations up to 1.3x the observed pre-period variation. Sample: 2,500 observations (309 treated, 191 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Log employment + +**Question**: Did minimum-wage increases reduce county employment? + +**Estimator**: `CallawaySantAnnaResults` + +## Headline + +The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). + +Statistically, the direction of the effect is strongly supported by the data. + +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `no_detected_violation` (joint p = 0.482) +- Power tier: `moderately_powered` +- Minimum detectable violation (MDV): 0.0105 +- MDV / |ATT|: 0.49 + +## Sensitivity (HonestDiD) + +- Method: `relative_magnitude` +- Breakdown M: 1.28 +- Conclusion: `robust_to_M_1.28` + +## Sample + +- Observations: 2,500 +- Treated: 309 +- Control: 191 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 4 +- Range: -0.0293 to -0.00305 +- CV: 0.668 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +- **INFO** — The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. 
For small effects, log-points approximate percentage changes. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. +- Compare with alternative estimators (SA, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. + +## References + +- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Callaway-Sant'Anna Staggered Difference-in-Differences Results +===================================================================================== + +Total observations: 2500 +Treated units: 309 +Never-treated units: 191 +Treatment cohorts: 3 +Time periods: 5 +Control group: never_treated +Base period: universal + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. 
+------------------------------------------------------------------------------------- +ATT -0.0214 0.0019 -11.397 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [-0.0251, -0.0178] +CV (SE/|ATT|): 0.0877 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +-4 0.0023 0.0036 0.627 0.5309 +-3 -0.0019 0.0023 -0.810 0.4179 +-2 -0.0020 0.0022 -0.875 0.3818 +-1 0.0000 nan nan nan +0 -0.0293 0.0019 -15.137 0.0000 *** +1 -0.0235 0.0023 -10.111 0.0000 *** +2 -0.0134 0.0031 -4.373 0.0000 *** +3 -0.0031 0.0035 -0.884 0.3767 +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": -0.021448663176265446, + "se": 0.0018820025192546833, + "ci_lower": -0.025137320332818274, + "ci_upper": -0.01776000601971262, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 4.341504320370796e-30, + "is_significant": true, + "near_significance_threshold": false, + "unit": "log_points", + "unit_kind": "log_points", + "sign": "negative", + "breakdown_M": 1.2776496410369873 +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." 
+} +``` +```json +{ + "status": "computed", + "method": "joint_wald_event_study", + "joint_p_value": 0.4816505473216015, + "verdict": "no_detected_violation", + "n_pre_periods": 3, + "n_dropped_undefined": null, + "reason": null, + "df_denom": null, + "power_status": "ran", + "power_reason": null, + "power_tier": "moderately_powered", + "mdv": 0.010472079171705551, + "mdv_share_of_att": 0.48823924762330606, + "power_covariance_source": "diag_fallback_available_full_vcov_unused" +} +``` +```json +{ + "status": "computed", + "method": "relative_magnitude", + "breakdown_M": 1.2776496410369873, + "conclusion": "robust_to_M_1.28", + "grid": [ + { + "M": 0.5, + "ci_lower": -0.026608074507223883, + "ci_upper": -0.008013136465533054, + "bound_lower": -0.022462755203290868, + "bound_upper": -0.01215845576946607, + "robust_to_zero": true + }, + { + "M": 1.0, + "ci_lower": -0.03176022422413628, + "ci_upper": -0.0028609867486206544, + "bound_lower": -0.027614904920203267, + "bound_upper": -0.00700630605255367, + "robust_to_zero": true + }, + { + "M": 1.5, + "ci_lower": -0.03691237394104868, + "ci_upper": 0.002291162968291743, + "bound_lower": -0.03276705463711566, + "bound_upper": -0.0018541563356412726, + "robust_to_zero": false + }, + { + "M": 2.0, + "ci_lower": -0.04206452365796108, + "ci_upper": 0.007443312685204144, + "bound_lower": -0.03791920435402807, + "bound_upper": 0.0032979933812711283, + "robust_to_zero": false + } + ] +} +``` +```json +[ + { + "severity": "warning", + "topic": "bacon_contamination", + "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." 
+ }, + { + "severity": "info", + "topic": "unit_policy", + "message": "The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes." + } +] +``` + +--- +## Cheng & Hoekstra (2013): Castle Doctrine laws +Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. Outcome: homicide rate per 100k population. + +Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in homicide rates in states that adopted Castle Doctrine (no deterrent effect; if anything, an escalation). + +### BusinessReport.summary() +``` +Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.00347); the headline should be treated as tentative pending the sensitivity analysis below. HonestDiD: the result is fragile — the confidence interval includes zero even at the smallest parallel-trends violations on the sensitivity grid. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Homicide rate (per 100k) + +**Question**: Did Castle Doctrine law adoption change state homicide rates? 
+ +**Estimator**: `CallawaySantAnnaResults` + +## Headline + +Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). + +Statistically, the direction of the effect is strongly supported by the data. + +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `clear_violation` (joint p = 0.00347) +- Power tier: `underpowered` +- Minimum detectable violation (MDV): 0.732 +- MDV / |ATT|: 1.3 + +## Sensitivity (HonestDiD) + +- Method: `relative_magnitude` +- Breakdown M: 0 +- Conclusion: `fragile` + +## Sample + +- Observations: 539 +- Treated: 22 +- Control: 27 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 6 +- Range: 0.237 to 0.764 +- CV: 0.348 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +- **WARNING** — HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. 
+- Compare with alternative estimators (SA, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. + +## References + +- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Callaway-Sant'Anna Staggered Difference-in-Differences Results +===================================================================================== + +Total observations: 539 +Treated units: 22 +Never-treated units: 27 +Treatment cohorts: 6 +Time periods: 11 +Control group: never_treated +Base period: universal + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +ATT 0.5608 0.1216 4.613 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [0.3225, 0.7991] +CV (SE/|ATT|): 0.2168 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. 
t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +-10 -0.3415 0.1462 -2.336 0.0195 * +-9 0.3406 0.5526 0.616 0.5377 +-8 -0.1465 0.1794 -0.816 0.4143 +-7 0.1393 0.3426 0.406 0.6844 +-6 0.2611 0.1574 1.659 0.0972 . +-5 -0.0466 0.1215 -0.383 0.7015 +-4 0.1224 0.1511 0.810 0.4180 +-3 0.0783 0.1505 0.520 0.6030 +-2 0.1541 0.1085 1.420 0.1555 +-1 0.0000 nan nan nan +0 0.4453 0.1606 2.772 0.0056 ** +1 0.7074 0.1957 3.614 0.0003 *** +2 0.7642 0.1590 4.807 0.0000 *** +3 0.5525 0.1582 3.492 0.0005 *** +4 0.2367 0.1789 1.323 0.1859 +5 0.6463 0.1227 5.269 0.0000 *** +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption + caveats +```json +{ + "effect": 0.5608256172839505, + "se": 0.12157293428086259, + "ci_lower": 0.32254704459860495, + "ci_upper": 0.7991041899692961, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 3.967463629059167e-06, + "is_significant": true, + "near_significance_threshold": false, + "unit": "per 100k population", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": 0.0 +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." 
+} +``` +```json +{ + "status": "computed", + "method": "joint_wald_event_study", + "joint_p_value": 0.003469090217576798, + "verdict": "clear_violation", + "n_pre_periods": 9, + "n_dropped_undefined": null, + "reason": null, + "df_denom": null, + "power_status": "ran", + "power_reason": null, + "power_tier": "underpowered", + "mdv": 0.7318611799799601, + "mdv_share_of_att": 1.3049710238350487, + "power_covariance_source": "diag_fallback_available_full_vcov_unused" +} +``` +```json +{ + "status": "computed", + "method": "relative_magnitude", + "breakdown_M": 0.0, + "conclusion": "fragile", + "grid": [ + { + "M": 0.5, + "ci_lower": -0.84457211437162, + "ci_upper": 1.9620045961559538, + "bound_lower": -0.6348485739226502, + "bound_upper": 1.752281055706984, + "robust_to_zero": false + }, + { + "M": 1.0, + "ci_lower": -2.038136929186437, + "ci_upper": 3.1555694109707706, + "bound_lower": -1.8284133887374672, + "bound_upper": 2.9458458705218007, + "robust_to_zero": false + }, + { + "M": 1.5, + "ci_lower": -3.231701744001254, + "ci_upper": 4.349134225785587, + "bound_lower": -3.021978203552284, + "bound_upper": 4.1394106853366175, + "robust_to_zero": false + }, + { + "M": 2.0, + "ci_lower": -4.4252665588160705, + "ci_upper": 5.542699040600405, + "bound_lower": -4.215543018367101, + "bound_upper": 5.332975500151435, + "robust_to_zero": false + } + ] +} +``` +```json +[ + { + "severity": "warning", + "topic": "bacon_contamination", + "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." 
+ }, + { + "severity": "warning", + "topic": "sensitivity_fragility", + "message": "HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative." + } +] +``` + +--- +## Castle Doctrine under Sun-Abraham (2021) +Same dataset and research question; different estimator. Testing BR/DR cross-estimator narrative consistency. + +### BusinessReport.summary() +``` +Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.0128); the headline should be treated as tentative. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. +``` +### BusinessReport.full_report() +```markdown +# Business Report: Homicide rate (per 100k) + +**Question**: Did Castle Doctrine law adoption change state homicide rates? + +**Estimator**: `SunAbrahamResults` + +## Headline + +Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). + +Statistically, the direction of the effect is strongly supported by the data. 
+ +## Identifying Assumption + +Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. + +## Pre-Trends + +- Verdict: `clear_violation` (joint p = 0.0128) +- Power tier: `moderately_powered` +- Minimum detectable violation (MDV): 0.551 +- MDV / |ATT|: 0.98 + +## Sensitivity (HonestDiD) + +- Sensitivity not computed: sensitivity is not applicable to SunAbrahamResults. + +## Sample + +- Observations: 539 +- Treated: 22 +- Control: 27 + +## Heterogeneity + +- Source: `event_study_effects_post` +- N effects: 6 +- Range: 0.237 to 0.764 +- CV: 0.348 +- Sign consistent: True + +## Caveats + +- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. + +## Next Steps + +- Define target parameter + - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. +- State identification assumptions + - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. +- Specification-based falsification + - _why_: Compare results across control group definitions (never_treated vs not_yet_treated) and anticipation settings to assess robustness. +- Compare with alternative estimators (CS, BJS, or Gardner) + - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. +- Report with and without covariates + - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. 
+ +## References + +- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies. Journal of Econometrics. +- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. +- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. + + +## Technical Appendix + +``` +===================================================================================== + Sun-Abraham Interaction-Weighted Estimator Results +===================================================================================== + +Total observations: 539 +Treated units: 22 +Control units: 27 +Treatment cohorts: 6 +Time periods: 11 +Control group: never_treated + +------------------------------------------------------------------------------------- + Overall Average Treatment Effect on the Treated +------------------------------------------------------------------------------------- +Parameter Estimate Std. Err. t-stat P>|t| Sig. +------------------------------------------------------------------------------------- +ATT 0.5608 0.1208 4.642 0.0000 *** +------------------------------------------------------------------------------------- + +95% Confidence Interval: [0.3240, 0.7976] +CV (SE/|ATT|): 0.2154 + +------------------------------------------------------------------------------------- + Event Study (Dynamic) Effects +------------------------------------------------------------------------------------- +Rel. Period Estimate Std. Err. t-stat P>|t| Sig. 
+------------------------------------------------------------------------------------- +-10 -0.3415 0.1566 -2.181 0.0292 * +-9 0.3406 0.1067 3.191 0.0014 ** +-8 -0.1465 0.1379 -1.062 0.2882 +-7 0.1393 0.3326 0.419 0.6754 +-6 0.2611 0.1646 1.586 0.1128 +-5 -0.0466 0.1181 -0.394 0.6933 +-4 0.1224 0.1344 0.911 0.3625 +-3 0.0783 0.1576 0.497 0.6194 +-2 0.1541 0.0957 1.610 0.1075 +0 0.4453 0.1627 2.737 0.0062 ** +1 0.7074 0.1903 3.716 0.0002 *** +2 0.7642 0.1612 4.739 0.0000 *** +3 0.5525 0.1646 3.357 0.0008 *** +4 0.2367 0.1909 1.240 0.2150 +5 0.6463 0.1313 4.921 0.0000 *** +------------------------------------------------------------------------------------- + +Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 +===================================================================================== +``` +``` +### BusinessReport.to_dict() - headline + assumption +```json +{ + "effect": 0.5608256172839505, + "se": 0.12081241968043965, + "ci_lower": 0.3240376258251508, + "ci_upper": 0.7976136087427503, + "alpha_was_honored": true, + "alpha_override_caveat": null, + "ci_level": 95, + "p_value": 3.448543002483855e-06, + "is_significant": true, + "near_significance_threshold": false, + "unit": "per 100k population", + "unit_kind": "unknown", + "sign": "positive", + "breakdown_M": null +} +``` +```json +{ + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": true, + "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." +} +``` + +--- diff --git a/docs/validation/validate_br_dr_canonical.py b/docs/validation/validate_br_dr_canonical.py new file mode 100644 index 00000000..cc411551 --- /dev/null +++ b/docs/validation/validate_br_dr_canonical.py @@ -0,0 +1,308 @@ +"""Run BusinessReport / DiagnosticReport on canonical DiD datasets. 
+ +Writes ``docs/validation/br_dr_canonical_validation.md`` with the +full BR ``summary()`` + ``full_report()`` + selected ``to_dict()`` +blocks for each dataset. The markdown output is the reviewable +artifact; compare it against canonical literature interpretations +and record any divergences in +``docs/validation/br_dr_canonical_findings.md``. + +Purpose: BR/DR gap #4 (real-dataset validation) — synthetic-DGP +tests pass but we haven't checked whether the prose output matches +canonical interpretations of applied work. + +Run via: ``python docs/validation/validate_br_dr_canonical.py``. +""" + +from __future__ import annotations + +import json +import sys +import warnings +from pathlib import Path + +import numpy as np + +from diff_diff import ( + BusinessReport, + CallawaySantAnna, + DifferenceInDifferences, + SunAbraham, +) +from diff_diff.datasets import ( + load_card_krueger, + load_castle_doctrine, + load_mpdta, +) + +OUT_PATH = Path(__file__).parent / "br_dr_canonical_validation.md" + + +def _section(title: str, level: int = 2) -> str: + return "#" * level + " " + title + "\n" + + +def _fence(body: str, lang: str = "") -> str: + return f"```{lang}\n{body.rstrip()}\n```\n" + + +def _dump_block(name: str, block: dict) -> str: + return _fence(json.dumps(block, indent=2, default=str), "json") + + +def _card_krueger_section() -> str: + """Card & Krueger (1994) minimum wage — classic 2x2 DiD. + + Canonical finding: no significant negative effect of NJ minimum-wage + increase on fast-food employment; published ATT ~ +2.8 FTE or + approximately 0.6 FTE per store depending on specification. CI + includes zero; direction positive. + """ + parts = [_section("Card & Krueger (1994): NJ/PA minimum wage", 2)] + ck = load_card_krueger() + # Reshape wide -> long per the docstring example. 
+ ck_long = ck.melt( + id_vars=["store_id", "state", "treated"], + value_vars=["emp_pre", "emp_post"], + var_name="period", + value_name="employment", + ) + ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) + did = DifferenceInDifferences() + fit = did.fit(ck_long, outcome="employment", treatment="treated", time="post") + br = BusinessReport( + fit, + outcome_label="FTE employment", + outcome_unit="FTE", + outcome_direction="higher_is_better", + business_question="Did the NJ minimum-wage increase reduce fast-food employment?", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, # 2x2 PT needs manual column kwargs; run without for now + ) + parts.append( + "Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA " + "(control, $4.25 throughout). Outcome: full-time equivalent employment. " + f"N={len(ck)} stores.\n\n" + ) + parts.append( + "Canonical interpretation: no significant disemployment effect of the " + "minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). " + "The famous finding was that the CI included zero.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _mpdta_section() -> str: + """Callaway-Sant'Anna benchmark (mpdta): county-level log employment + under staggered minimum-wage increases. + + Canonical finding: CS aggregate ATT roughly -0.04 to -0.05 on log + employment (i.e., ~4-5% employment decline for treated counties). + Group-level ATT(g,t) shown in CS Figure 1. 
+ """ + parts = [_section("Callaway-Sant'Anna benchmark (mpdta)", 2)] + df = load_mpdta() + cs = CallawaySantAnna(base_period="universal") + fit = cs.fit( + df, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + fit, + outcome_label="Log employment", + outcome_unit="log_points", + outcome_direction="higher_is_better", + business_question="Did minimum-wage increases reduce county employment?", + treatment_label="the state-level minimum wage increase", + data=df, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + ) + parts.append( + "Data: simulated county-level panel from R `did` package (Callaway & " + "Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: " + "log employment (`lemp`).\n\n" + ) + parts.append( + "Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) " + "on treated counties; group-specific ATT(g,t) negative across cohorts. " + "See CS (2021) Figures 1-2.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) + parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _castle_doctrine_section() -> str: + """Cheng & Hoekstra (2013): staggered adoption of Castle Doctrine laws. + + Canonical finding: ~8% increase in homicide rates in adopting + states; no deterrent effect on burglary or other crimes. 
+ """ + parts = [_section("Cheng & Hoekstra (2013): Castle Doctrine laws", 2)] + df = load_castle_doctrine() + # CS with never-treated as control; outcome = homicide rate. + cs = CallawaySantAnna(base_period="universal", control_group="never_treated") + fit = cs.fit( + df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + fit, + outcome_label="Homicide rate (per 100k)", + outcome_unit="per 100k population", + outcome_direction="lower_is_better", + business_question=( + "Did Castle Doctrine law adoption change state homicide rates?" + ), + treatment_label="Castle Doctrine law adoption", + data=df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + parts.append( + "Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. " + "Outcome: homicide rate per 100k population.\n\n" + ) + parts.append( + "Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in " + "homicide rates in states that adopted Castle Doctrine (no deterrent " + "effect; if anything, an escalation).\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) + parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) + parts.append(_dump_block("caveats", d.get("caveats", []))) + parts.append("\n---\n") + return "".join(parts) + + +def _castle_doctrine_sun_abraham_section() -> str: + """Same Castle Doctrine dataset but run through Sun-Abraham, as a + cross-estimator consistency 
check. If SA and CS narrate the same + canonical finding differently, that's a BR/DR source-faithfulness + issue. + """ + parts = [_section("Castle Doctrine under Sun-Abraham (2021)", 2)] + df = load_castle_doctrine() + sa = SunAbraham() + fit = sa.fit( + df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br = BusinessReport( + fit, + outcome_label="Homicide rate (per 100k)", + outcome_unit="per 100k population", + outcome_direction="lower_is_better", + business_question=( + "Did Castle Doctrine law adoption change state homicide rates?" + ), + treatment_label="Castle Doctrine law adoption", + data=df, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + parts.append( + "Same dataset and research question; different estimator. Testing BR/DR " + "cross-estimator narrative consistency.\n\n" + ) + parts.append(_section("BusinessReport.summary()", 3)) + parts.append(_fence(br.summary())) + parts.append(_section("BusinessReport.full_report()", 3)) + parts.append(_fence(br.full_report(), "markdown")) + parts.append(_section("BusinessReport.to_dict() - headline + assumption", 3)) + d = br.to_dict() + parts.append(_dump_block("headline", d.get("headline", {}))) + parts.append(_dump_block("assumption", d.get("assumption", {}))) + parts.append("\n---\n") + return "".join(parts) + + +def main() -> int: + warnings.filterwarnings("ignore") + np.random.seed(42) + + header = ( + "# BR / DR canonical-dataset validation\n\n" + "Output of ``docs/validation/validate_br_dr_canonical.py``. Each section " + "runs BusinessReport (and its auto-constructed DiagnosticReport) on a " + "canonical DiD dataset and dumps summary + full_report + selected " + "to_dict blocks. 
The purpose is to compare BR's prose output against " + "published canonical interpretations and record divergences in " + "``br_dr_canonical_findings.md``.\n\n" + "This file is regenerable; do not hand-edit.\n\n" + "Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 " + "benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA).\n\n" + "---\n\n" + ) + + sections = [header] + for name, fn in ( + ("card_krueger", _card_krueger_section), + ("mpdta", _mpdta_section), + ("castle_doctrine_cs", _castle_doctrine_section), + ("castle_doctrine_sa", _castle_doctrine_sun_abraham_section), + ): + print(f"Running {name} ...", file=sys.stderr) + try: + sections.append(fn()) + except Exception as exc: # noqa: BLE001 + sections.append( + _section(f"{name} (ERROR)", 2) + + _fence(f"{type(exc).__name__}: {exc}") + + "\n---\n" + ) + print(f" {type(exc).__name__}: {exc}", file=sys.stderr) + + OUT_PATH.write_text("".join(sections)) + print(f"Wrote {OUT_PATH}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index cda3e5a7..77e059e3 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3975,6 +3975,174 @@ def test_br_rejects_both_passthrough_inputs_names_them(self): assert "precomputed['sensitivity']" in msg +class TestCanonicalValidationSurfaceFixes: + """Regression coverage for issues surfaced by the first round of + BR/DR canonical-dataset validation (``docs/validation/ + validate_br_dr_canonical.py``). Each test pins a wording bug + observed on a real published-applied-work fit. 
+ """ + + def _cs_like_stub_with_zero_breakdown(self): + """CS-style result stub matching the Cheng-Hoekstra Castle + Doctrine fit pattern.""" + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 0.5608 + stub.overall_se = 0.1216 + stub.overall_p_value = 0.0 + stub.overall_conf_int = (0.323, 0.799) + stub.alpha = 0.05 + stub.n_obs = 539 + stub.n_treated = 22 + stub.n_control_units = 27 + stub.survey_metadata = None + stub.event_study_effects = None + stub.base_period = "universal" + stub.inference_method = "analytical" + return stub + + def _fragile_dr_schema(self, breakdown_m: float): + """Build a fake DiagnosticReportResults whose ``sensitivity`` + block carries the given ``breakdown_M`` value.""" + from diff_diff.diagnostic_report import DiagnosticReportResults + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": {}, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": [], + }, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + return DiagnosticReportResults( + schema=schema, + interpretation="", + applicable_checks=("sensitivity",), + skipped_checks={}, + warnings=(), + ) + + def test_treatment_label_preserves_embedded_abbreviations(self): + """Card-Krueger use case: ``treatment_label="the NJ minimum-wage + increase"`` previously rendered as 
``"The nj minimum-wage + increase"`` because ``str.capitalize()`` lowercases every + character after the first. The fix preserves user-supplied + casing and only uppercases the first character. + """ + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.47 + stub.se = 1.93 + stub.t_stat = 0.76 + stub.p_value = 0.45 + stub.conf_int = (-2.32, 5.27) + stub.alpha = 0.05 + stub.n_obs = 620 + stub.n_treated = 462 + stub.n_control = 158 + stub.survey_metadata = None + stub.inference_method = "analytical" + br = BusinessReport( + stub, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + headline = br.headline() + assert "The NJ minimum-wage increase" in headline, ( + "Embedded ``NJ`` abbreviation must survive the first-word " + f"capitalization. Got headline: {headline!r}" + ) + assert "The nj" not in headline, ( + "Previous capitalize() bug lowercased the NJ abbreviation. " f"Got: {headline!r}" + ) + + def test_treatment_label_preserves_proper_noun_case(self): + """Castle Doctrine use case: ``treatment_label="Castle Doctrine + law adoption"`` previously rendered as ``"Castle doctrine law + adoption"`` because capitalize() lowercased the rest. Must + preserve proper-noun casing. + """ + stub = self._cs_like_stub_with_zero_breakdown() + br = BusinessReport( + stub, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + auto_diagnostics=False, + ) + headline = br.headline() + assert ( + "Castle Doctrine law adoption" in headline + ), f"Proper-noun casing must be preserved. Got: {headline!r}" + + def test_breakdown_m_zero_uses_smallest_grid_point_wording(self): + """Cheng-Hoekstra Castle Doctrine produces ``breakdown_M == 0`` + under HonestDiD. The old wording "violations reach 0x the + pre-period variation" reads as a degenerate zero-times-variation + sentence. 
The fix switches to "includes zero even at the + smallest parallel-trends violations on the sensitivity grid" + for breakdown values at or near zero. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.0) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "0x" not in summary, ( + f"Summary must not render ``0x the pre-period variation``; " + f"that reads as zero-times-anything. Got: {summary!r}" + ) + assert "smallest parallel-trends violations" in summary, ( + f"Summary must use the smallest-grid-point wording at " + f"breakdown_M == 0. Got: {summary!r}" + ) + + def test_breakdown_m_small_positive_still_uses_smallest_grid_point_wording(self): + """Breakdown values just above zero (e.g., 0.03) should also + route through the smallest-grid-point wording — quoting + ``0.03x`` to a stakeholder is equally uninformative. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.03) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "smallest parallel-trends violations" in summary + assert "0.03x" not in summary + + def test_breakdown_m_normal_keeps_multiplier_wording(self): + """Breakdown values at the usual fragile-but-nonzero range + (e.g., 0.3) must still quote the ``0.3x`` multiplier — the + smallest-grid-point wording is only for the degenerate tail. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.3) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "0.3x" in summary + assert "smallest parallel-trends violations" not in summary + + class TestBaconCaveatEstimatorAware: """Round-45 P1 CI review on PR #318: Goodman-Bacon decomposes TWFE weights. 
On fits already produced by a heterogeneity-robust From 1818503fe18fa5be9a61122dafab65cb910f90b8 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 19:45:19 -0400 Subject: [PATCH 2/3] Restructure canonical validation: replace one-shot script with regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: the validation script + findings doc were one-shot artifacts that would age poorly. Replace them with ``tests/test_br_dr_canonical_datasets.py`` — pytest regression guards that assert canonical properties (direction, PT verdict tier, HonestDiD breakdown_M tier, cross-estimator consistency) on each canonical fit. Uses the ``_construct_*`` fallback data from ``diff_diff.datasets`` so tests have no network dependency (same pattern ``test_datasets.py`` already uses). Tests cover: - Card-Krueger (1994): positive sign, CI includes zero, "consistent with no effect" prose. - mpdta (CS 2021 benchmark): negative ATT, breakdown_M > 1.0, no_detected_violation pre-trends. - Castle Doctrine (Cheng-Hoekstra 2013) under CS: positive ATT, clear_violation pre-trends, fragile sensitivity (breakdown_M < 0.5). - Castle Doctrine cross-estimator consistency: SA agrees with CS on direction and PT verdict bin. - Treatment-label capitalization bugs: ``NJ`` abbreviation and ``Castle Doctrine`` proper noun preserved through BR's sentence capitalization. - ``breakdown_M == 0`` edge case: BR summary uses smallest-grid-point wording, not the degenerate ``0x`` multiplier. Drops: - ``docs/validation/validate_br_dr_canonical.py`` — one-shot script, replaced by the regression tests. - ``docs/validation/br_dr_canonical_validation.md`` — raw dump, regenerable on demand if needed but not checked in. - ``docs/validation/br_dr_canonical_findings.md`` — summary now lives in the regression-test docstrings. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/validation/br_dr_canonical_findings.md | 175 ----- docs/validation/br_dr_canonical_validation.md | 723 ------------------ docs/validation/validate_br_dr_canonical.py | 308 -------- tests/test_br_dr_canonical_datasets.py | 356 +++++++++ 4 files changed, 356 insertions(+), 1206 deletions(-) delete mode 100644 docs/validation/br_dr_canonical_findings.md delete mode 100644 docs/validation/br_dr_canonical_validation.md delete mode 100644 docs/validation/validate_br_dr_canonical.py create mode 100644 tests/test_br_dr_canonical_datasets.py diff --git a/docs/validation/br_dr_canonical_findings.md b/docs/validation/br_dr_canonical_findings.md deleted file mode 100644 index 7f369573..00000000 --- a/docs/validation/br_dr_canonical_findings.md +++ /dev/null @@ -1,175 +0,0 @@ -# BR / DR canonical-dataset validation findings - -This file records divergences observed in -``br_dr_canonical_validation.md`` against canonical literature -interpretations. Generated by running -``docs/validation/validate_br_dr_canonical.py`` on the bundled -datasets (Card-Krueger 1994, Callaway-Sant'Anna mpdta benchmark, -Castle Doctrine / Cheng-Hoekstra 2013). This closes BR/DR -foundation gap #4 — real-dataset validation — from the -external-positioning gap list in -``project_br_dr_foundation.md``. - -The goal of the validation exercise is to stress-test BR's prose on -fits that published applied work has already interpreted, not to -exactly reproduce their point estimates (the bundled datasets are -either the R `did` package simulated benchmark or the causaldata -mirrors, which may differ from the original author data). - -## Headline assessment - -BR's prose direction, verdicts, and caveat framing match canonical -interpretations across all four runs: - -- **Card-Krueger**: positive sign, CI includes zero, "data consistent - with no effect." Matches the famous Card-Krueger finding of no - disemployment. 
-- **mpdta (CS)**: aggregate ATT negative (-0.021 log-points), pre-trends
-  `no_detected_violation`, HonestDiD `robust_to_M_1.28`. Matches CS
-  tutorial expectations that the fit is robust.
-- **Castle Doctrine (CS)**: positive sign (homicides went up), pre-trends
-  `clear_violation` (joint p = 0.003), HonestDiD `fragile`
-  (breakdown_M = 0). Matches Cheng-Hoekstra's escalation finding AND
-  correctly flags the identifying-assumption fragility the staggered
-  rollout produces.
-- **Castle Doctrine (SA)**: identical point estimates (as expected —
-  CS and SA are algebraically consistent on this data), same clear PT
-  violation verdict.
-
-No wrong-sign or wrong-verdict findings surfaced on any of the four
-runs. The Bacon "already-robust" framing lifted from round-45 reads
-correctly on the staggered fits (CS and SA on Castle Doctrine and
-mpdta): the caveat is scoped as a statement about the rollout
-design, not a switch-estimator recommendation.
-
-## Issues fixed in this PR
-
-Small prose bugs surfaced by the real-data output. Each is a wording
-fix, not a methodology defect. Both fixes are regression-tested under
-``tests/test_business_report.py::TestCanonicalValidationSurfaceFixes``.
-
-### Issue 1 (FIXED): Treatment label first-word capitalization eats abbreviations
-
-Card-Krueger output:
-
-> The **nj** minimum-wage increase lifted FTE employment by 1.47 FTE …
-
-Castle Doctrine output (CS and SA):
-
-> **Castle doctrine** law adoption worsened Homicide rate (per 100k) …
-
-BR used ``str.capitalize()``, which lowercases every character after
-the first. For labels starting with an abbreviation (``"the NJ
-minimum-wage increase"``) or a proper-noun phrase (``"Castle Doctrine
-law adoption"``), this flattened case in a way that looked wrong in
-stakeholder-facing prose.
-
-**Fix**: replaced ``str.capitalize()`` with a new
-``_sentence_first_upper`` helper that uppercases only the first
-character and preserves user-supplied casing for everything else.
- -### Issue 2 (FIXED): ``breakdown_M = 0`` phrasing reads as "0x" (zero-times-something) - -Castle Doctrine (CS): - -> HonestDiD: the result is fragile — the confidence interval -> includes zero once violations reach **0x** the pre-period -> variation. - -When the breakdown M is exactly 0, "reach 0x the pre-period -variation" is a degenerate reading — the CI already includes zero -under any nonzero pre-trend violation (or even with zero violation, -depending on the grid). - -**Fix**: when ``breakdown_M <= 0.05``, both BR's summary and DR's -overall-interpretation sentence emit "the confidence interval -includes zero even at the smallest parallel-trends violations on -the sensitivity grid" instead of quoting the ``0x`` multiplier. The -0.05 threshold also covers near-zero values (e.g., 0.03) where the -multiplier is equally uninformative to stakeholders. - -### Issue 3 (deferred): Outcome-label capitalization in mid-sentence - -mpdta output: - -> … reduced **Log employment** by 0.0214 log-points … - -The user-supplied ``outcome_label="Log employment"`` is -capitalized as-is, which looks awkward mid-sentence. This is -stylistic, and arguably user-controllable (the user could pass -``outcome_label="log employment"``). Deprioritize unless fixing -Issue 1 is trivially extensible. Noted here for follow-up. - -## Issues to track as follow-up (out-of-scope for this PR) - -### Follow-up A: ``SunAbrahamResults`` excluded from HonestDiD applicability - -BR's applicability matrix (``diagnostic_report.py`` line ~107) lists -``SunAbrahamResults`` with ``{parallel_trends, pretrends_power, bacon, -design_effect, heterogeneity}`` — NO ``sensitivity``. But the -original plan's applicability matrix in -``project_br_dr_foundation.md`` and the SA methodology surface -(event-study coefficients + VCov) both support HonestDiD in principle. 
- -Observed on Castle Doctrine (SA): - -> ## Sensitivity (HonestDiD) -> -> - Sensitivity not computed: sensitivity is not applicable to -> SunAbrahamResults. - -Given SA shows the same PT violation that CS does on this dataset, -not having HonestDiD sensitivity on SA is a real usability gap. This -requires adding an SA adapter to ``compute_honest_did`` and expanding -the applicability matrix; it is library work beyond BR/DR prose. -Belongs in the BR/DR gap-list expansion. - -### Follow-up B: Target-parameter clarity (gap list item #6) - -The assumption block on every staggered fit still reads: - -> Identification relies on parallel trends across treatment cohorts -> and time periods (group-time ATT), plus no anticipation. - -But the CS ``overall_att`` for mpdta is a specific weighted average -of ``ATT(g, t)`` cells, SA is an IW average, Stacked is a -sub-experiment-weighted average, dCDH is a switchers average. BR's -headline reports a single number without disambiguating the -estimand. For Baker et al. (2025) practitioner-guide parity, the -assumption block should carry the target-parameter clause. -Already tracked as gap #6. - -### Follow-up C: Card-Krueger effect size differs from published ATT - -Our bundled ``load_card_krueger`` returns an ATT of +1.47 FTE; the -published Card-Krueger ATT is ~+0.59 FTE. The direction and -CI-includes-zero verdict match canonical, but the magnitude does -not. This is a ``datasets.py`` data-loading question (the -causaldata mirror may aggregate differently than the original -author sample), not a BR prose bug. Noted here so a future -data-validation PR can address it upstream. - -## What was validated (summary) - -- End-to-end BR / DR flow runs without errors on 4 canonical datasets. -- Direction of the effect matches canonical interpretation on all 4. -- Pre-trends verdict tier (no_detected_violation / clear_violation) - matches the literature's reading. -- HonestDiD sensitivity tier (robust vs fragile) matches. 
-- Bacon "already-robust" framing from round-45 reads correctly on - real staggered data. -- The identifying-assumption source-faithfulness retags from - round-42 (BJS / Gardner untreated-outcome FE model) did not - surface on these runs because none of the datasets was run - through ImputationDiD or TwoStageDiD — follow-up validation - should add those. - -## Regeneration - -```bash -python docs/validation/validate_br_dr_canonical.py -``` - -The script writes ``br_dr_canonical_validation.md`` (the raw output -artifact); this file is the findings synthesis and is written by -hand from that artifact. diff --git a/docs/validation/br_dr_canonical_validation.md b/docs/validation/br_dr_canonical_validation.md deleted file mode 100644 index 99bb5af0..00000000 --- a/docs/validation/br_dr_canonical_validation.md +++ /dev/null @@ -1,723 +0,0 @@ -# BR / DR canonical-dataset validation - -Output of ``docs/validation/validate_br_dr_canonical.py``. Each section runs BusinessReport (and its auto-constructed DiagnosticReport) on a canonical DiD dataset and dumps summary + full_report + selected to_dict blocks. The purpose is to compare BR's prose output against published canonical interpretations and record divergences in ``br_dr_canonical_findings.md``. - -This file is regenerable; do not hand-edit. - -Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA). - ---- - -## Card & Krueger (1994): NJ/PA minimum wage -Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA (control, $4.25 throughout). Outcome: full-time equivalent employment. N=310 stores. - -Canonical interpretation: no significant disemployment effect of the minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). The famous finding was that the CI included zero. - -### BusinessReport.summary() -``` -Question: Did the NJ minimum-wage increase reduce fast-food employment? 
The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). Statistically, the confidence interval includes zero; the data are consistent with no effect. Sample: 620 observations (462 treated, 158 control). -``` -### BusinessReport.full_report() -```markdown -# Business Report: FTE employment - -**Question**: Did the NJ minimum-wage increase reduce fast-food employment? - -**Estimator**: `DiDResults` - -## Headline - -The NJ minimum-wage increase lifted FTE employment by 1.47 FTE (95% CI: -2.32 FTE to 5.27 FTE). - -Statistically, the confidence interval includes zero; the data are consistent with no effect. - -## Identifying Assumption - -Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group. - -## Pre-Trends - -- Pre-trends not computed: auto_diagnostics=False - -## Sensitivity (HonestDiD) - -- Sensitivity not computed: auto_diagnostics=False - -## Sample - -- Observations: 620 -- Treated: 462 -- Control: 158 - -## References - -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. - - -## Technical Appendix - -``` -====================================================================== - Difference-in-Differences Estimation Results -====================================================================== - -Observations: 620 -Treated: 462 -Control: 158 -R-squared: 0.0036 -Variance: HC1 heteroskedasticity-robust - ----------------------------------------------------------------------- -Parameter Estimate Std. Err. 
t-stat P>|t| ----------------------------------------------------------------------- -ATT 1.4718 1.9320 0.762 0.4465 ----------------------------------------------------------------------- - -95% Confidence Interval: [-2.3224, 5.2660] -CV (SE/|ATT|): 1.3127 - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 0.1 -====================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": 1.4718176338428604, - "se": 1.9320362599811534, - "ci_lower": -2.322358689575049, - "ci_upper": 5.26599395726077, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 0.4464732839915416, - "is_significant": false, - "near_significance_threshold": false, - "unit": "FTE", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": null -} -``` -```json -{ - "parallel_trends_variant": "unconditional", - "no_anticipation": true, - "description": "Identification relies on the standard DiD parallel-trends assumption plus no anticipation of treatment by either group." -} -``` -```json -[] -``` - ---- -## Callaway-Sant'Anna benchmark (mpdta) -Data: simulated county-level panel from R `did` package (Callaway & Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: log employment (`lemp`). - -Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) on treated counties; group-specific ATT(g,t) negative across cohorts. See CS (2021) Figures 1-2. - -### BusinessReport.summary() -``` -Question: Did minimum-wage increases reduce county employment? The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients do not reject parallel trends; the test is moderately informative. 
See the sensitivity analysis below for bounded-violation guarantees. HonestDiD: the result remains significant under parallel-trends violations up to 1.3x the observed pre-period variation. Sample: 2,500 observations (309 treated, 191 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Log employment - -**Question**: Did minimum-wage increases reduce county employment? - -**Estimator**: `CallawaySantAnnaResults` - -## Headline - -The state-level minimum wage increase reduced Log employment by 0.0214 log-points (95% CI: -0.0251 log-points to -0.0178 log-points). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `no_detected_violation` (joint p = 0.482) -- Power tier: `moderately_powered` -- Minimum detectable violation (MDV): 0.0105 -- MDV / |ATT|: 0.49 - -## Sensitivity (HonestDiD) - -- Method: `relative_magnitude` -- Breakdown M: 1.28 -- Conclusion: `robust_to_M_1.28` - -## Sample - -- Observations: 2,500 -- Treated: 309 -- Control: 191 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 4 -- Range: -0.0293 to -0.00305 -- CV: 0.668 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -- **INFO** — The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. -- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Compare with alternative estimators (SA, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. 
- - -## Technical Appendix - -``` -===================================================================================== - Callaway-Sant'Anna Staggered Difference-in-Differences Results -===================================================================================== - -Total observations: 2500 -Treated units: 309 -Never-treated units: 191 -Treatment cohorts: 3 -Time periods: 5 -Control group: never_treated -Base period: universal - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- -ATT -0.0214 0.0019 -11.397 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [-0.0251, -0.0178] -CV (SE/|ATT|): 0.0877 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --4 0.0023 0.0036 0.627 0.5309 --3 -0.0019 0.0023 -0.810 0.4179 --2 -0.0020 0.0022 -0.875 0.3818 --1 0.0000 nan nan nan -0 -0.0293 0.0019 -15.137 0.0000 *** -1 -0.0235 0.0023 -10.111 0.0000 *** -2 -0.0134 0.0031 -4.373 0.0000 *** -3 -0.0031 0.0035 -0.884 0.3767 -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": -0.021448663176265446, - "se": 0.0018820025192546833, - "ci_lower": -0.025137320332818274, - "ci_upper": -0.01776000601971262, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 4.341504320370796e-30, - "is_significant": true, - "near_significance_threshold": false, - "unit": "log_points", - "unit_kind": "log_points", - "sign": "negative", - "breakdown_M": 1.2776496410369873 -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` -```json -{ - "status": "computed", - "method": "joint_wald_event_study", - "joint_p_value": 0.4816505473216015, - "verdict": "no_detected_violation", - "n_pre_periods": 3, - "n_dropped_undefined": null, - "reason": null, - "df_denom": null, - "power_status": "ran", - "power_reason": null, - "power_tier": "moderately_powered", - "mdv": 0.010472079171705551, - "mdv_share_of_att": 0.48823924762330606, - "power_covariance_source": "diag_fallback_available_full_vcov_unused" -} -``` -```json -{ - "status": "computed", - "method": "relative_magnitude", - "breakdown_M": 1.2776496410369873, - "conclusion": "robust_to_M_1.28", - "grid": [ - { - "M": 0.5, - "ci_lower": -0.026608074507223883, - "ci_upper": -0.008013136465533054, - "bound_lower": -0.022462755203290868, - "bound_upper": -0.01215845576946607, - "robust_to_zero": true - }, - { - "M": 1.0, - "ci_lower": -0.03176022422413628, - "ci_upper": -0.0028609867486206544, - "bound_lower": -0.027614904920203267, - "bound_upper": -0.00700630605255367, - "robust_to_zero": true - }, - { - "M": 1.5, - "ci_lower": -0.03691237394104868, - "ci_upper": 0.002291162968291743, - 
"bound_lower": -0.03276705463711566, - "bound_upper": -0.0018541563356412726, - "robust_to_zero": false - }, - { - "M": 2.0, - "ci_lower": -0.04206452365796108, - "ci_upper": 0.007443312685204144, - "bound_lower": -0.03791920435402807, - "bound_upper": 0.0032979933812711283, - "robust_to_zero": false - } - ] -} -``` -```json -[ - { - "severity": "warning", - "topic": "bacon_contamination", - "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." - }, - { - "severity": "info", - "topic": "unit_policy", - "message": "The effect is reported in log-points as estimated; BusinessReport does not arithmetically translate log-points to percent or level changes. For small effects, log-points approximate percentage changes." - } -] -``` - ---- -## Cheng & Hoekstra (2013): Castle Doctrine laws -Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. Outcome: homicide rate per 100k population. - -Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in homicide rates in states that adopted Castle Doctrine (no deterrent effect; if anything, an escalation). - -### BusinessReport.summary() -``` -Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.00347); the headline should be treated as tentative pending the sensitivity analysis below. 
HonestDiD: the result is fragile — the confidence interval includes zero even at the smallest parallel-trends violations on the sensitivity grid. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Homicide rate (per 100k) - -**Question**: Did Castle Doctrine law adoption change state homicide rates? - -**Estimator**: `CallawaySantAnnaResults` - -## Headline - -Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.323 per 100k population to 0.799 per 100k population). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `clear_violation` (joint p = 0.00347) -- Power tier: `underpowered` -- Minimum detectable violation (MDV): 0.732 -- MDV / |ATT|: 1.3 - -## Sensitivity (HonestDiD) - -- Method: `relative_magnitude` -- Breakdown M: 0 -- Conclusion: `fragile` - -## Sample - -- Observations: 539 -- Treated: 22 -- Control: 27 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 6 -- Range: 0.237 to 0.764 -- CV: 0.348 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -- **WARNING** — HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. -- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Compare with alternative estimators (SA, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Callaway, B., & Sant'Anna, P. H. C. (2021). Difference-in-Differences with multiple time periods. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. 
- - -## Technical Appendix - -``` -===================================================================================== - Callaway-Sant'Anna Staggered Difference-in-Differences Results -===================================================================================== - -Total observations: 539 -Treated units: 22 -Never-treated units: 27 -Treatment cohorts: 6 -Time periods: 11 -Control group: never_treated -Base period: universal - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- -ATT 0.5608 0.1216 4.613 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [0.3225, 0.7991] -CV (SE/|ATT|): 0.2168 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --10 -0.3415 0.1462 -2.336 0.0195 * --9 0.3406 0.5526 0.616 0.5377 --8 -0.1465 0.1794 -0.816 0.4143 --7 0.1393 0.3426 0.406 0.6844 --6 0.2611 0.1574 1.659 0.0972 . --5 -0.0466 0.1215 -0.383 0.7015 --4 0.1224 0.1511 0.810 0.4180 --3 0.0783 0.1505 0.520 0.6030 --2 0.1541 0.1085 1.420 0.1555 --1 0.0000 nan nan nan -0 0.4453 0.1606 2.772 0.0056 ** -1 0.7074 0.1957 3.614 0.0003 *** -2 0.7642 0.1590 4.807 0.0000 *** -3 0.5525 0.1582 3.492 0.0005 *** -4 0.2367 0.1789 1.323 0.1859 -5 0.6463 0.1227 5.269 0.0000 *** -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption + caveats -```json -{ - "effect": 0.5608256172839505, - "se": 0.12157293428086259, - "ci_lower": 0.32254704459860495, - "ci_upper": 0.7991041899692961, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 3.967463629059167e-06, - "is_significant": true, - "near_significance_threshold": false, - "unit": "per 100k population", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": 0.0 -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` -```json -{ - "status": "computed", - "method": "joint_wald_event_study", - "joint_p_value": 0.003469090217576798, - "verdict": "clear_violation", - "n_pre_periods": 9, - "n_dropped_undefined": null, - "reason": null, - "df_denom": null, - "power_status": "ran", - "power_reason": null, - "power_tier": "underpowered", - "mdv": 0.7318611799799601, - "mdv_share_of_att": 1.3049710238350487, - "power_covariance_source": "diag_fallback_available_full_vcov_unused" -} -``` -```json -{ - "status": "computed", - "method": "relative_magnitude", - "breakdown_M": 0.0, - "conclusion": "fragile", - "grid": [ - { - "M": 0.5, - "ci_lower": -0.84457211437162, - "ci_upper": 1.9620045961559538, - "bound_lower": -0.6348485739226502, - "bound_upper": 1.752281055706984, - "robust_to_zero": false - }, - { - "M": 1.0, - "ci_lower": -2.038136929186437, - "ci_upper": 3.1555694109707706, - "bound_lower": -1.8284133887374672, - "bound_upper": 2.9458458705218007, - "robust_to_zero": false - }, - { - "M": 1.5, - "ci_lower": -3.231701744001254, - "ci_upper": 4.349134225785587, - "bound_lower": -3.021978203552284, - "bound_upper": 4.1394106853366175, - 
"robust_to_zero": false - }, - { - "M": 2.0, - "ci_lower": -4.4252665588160705, - "ci_upper": 5.542699040600405, - "bound_lower": -4.215543018367101, - "bound_upper": 5.332975500151435, - "robust_to_zero": false - } - ] -} -``` -```json -[ - { - "severity": "warning", - "topic": "bacon_contamination", - "message": "Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." - }, - { - "severity": "warning", - "topic": "sensitivity_fragility", - "message": "HonestDiD breakdown value is 0: the result's confidence interval includes zero once parallel-trends violations reach less than half the observed pre-period variation. Treat the headline as tentative." - } -] -``` - ---- -## Castle Doctrine under Sun-Abraham (2021) -Same dataset and research question; different estimator. Testing BR/DR cross-estimator narrative consistency. - -### BusinessReport.summary() -``` -Question: Did Castle Doctrine law adoption change state homicide rates? Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). Statistically, the direction of the effect is strongly supported by the data. Pre-treatment event-study coefficients clearly reject parallel trends (joint p = 0.0128); the headline should be treated as tentative. Sample: 539 observations (22 treated, 27 control). Caveat: Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. 
A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. -``` -### BusinessReport.full_report() -```markdown -# Business Report: Homicide rate (per 100k) - -**Question**: Did Castle Doctrine law adoption change state homicide rates? - -**Estimator**: `SunAbrahamResults` - -## Headline - -Castle Doctrine law adoption worsened Homicide rate (per 100k) by 0.561 per 100k population (95% CI: 0.324 per 100k population to 0.798 per 100k population). - -Statistically, the direction of the effect is strongly supported by the data. - -## Identifying Assumption - -Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation. - -## Pre-Trends - -- Verdict: `clear_violation` (joint p = 0.0128) -- Power tier: `moderately_powered` -- Minimum detectable violation (MDV): 0.551 -- MDV / |ATT|: 0.98 - -## Sensitivity (HonestDiD) - -- Sensitivity not computed: sensitivity is not applicable to SunAbrahamResults. - -## Sample - -- Observations: 539 -- Treated: 22 -- Control: 27 - -## Heterogeneity - -- Source: `event_study_effects_post` -- N effects: 6 -- Range: 0.237 to 0.764 -- CV: 0.348 -- Sign consistent: True - -## Caveats - -- **WARNING** — Goodman-Bacon decomposition places 32% of TWFE weight on 'forbidden' later-vs-earlier comparisons. A TWFE benchmark on this rollout would be materially biased under heterogeneous effects; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity. - -## Next Steps - -- Define target parameter - - _why_: State explicitly what causal effect you are estimating (ATT, ATT(g,t), weighted/unweighted) and what policy question it answers. 
-- State identification assumptions - - _why_: Name the parallel trends variant you are invoking (unconditional, conditional, PT-GT-NYT, etc.), the no-anticipation assumption, and any overlap conditions. -- Specification-based falsification - - _why_: Compare results across control group definitions (never_treated vs not_yet_treated) and anticipation settings to assess robustness. -- Compare with alternative estimators (CS, BJS, or Gardner) - - _why_: Agreement across estimators with different assumptions strengthens conclusions. Disagreement reveals sensitivity. -- Report with and without covariates - - _why_: Shows whether results are sensitive to covariate conditioning. Large shifts suggest covariates are driving identification. - -## References - -- Sun, L., & Abraham, S. (2021). Estimating dynamic treatment effects in event studies. Journal of Econometrics. -- Rambachan, A., & Roth, J. (2023). A More Credible Approach to Parallel Trends. Review of Economic Studies. -- Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & Sant'Anna, P. H. C. (2025). Difference-in-Differences Designs: A Practitioner's Guide. - - -## Technical Appendix - -``` -===================================================================================== - Sun-Abraham Interaction-Weighted Estimator Results -===================================================================================== - -Total observations: 539 -Treated units: 22 -Control units: 27 -Treatment cohorts: 6 -Time periods: 11 -Control group: never_treated - -------------------------------------------------------------------------------------- - Overall Average Treatment Effect on the Treated -------------------------------------------------------------------------------------- -Parameter Estimate Std. Err. t-stat P>|t| Sig. 
-------------------------------------------------------------------------------------- -ATT 0.5608 0.1208 4.642 0.0000 *** -------------------------------------------------------------------------------------- - -95% Confidence Interval: [0.3240, 0.7976] -CV (SE/|ATT|): 0.2154 - -------------------------------------------------------------------------------------- - Event Study (Dynamic) Effects -------------------------------------------------------------------------------------- -Rel. Period Estimate Std. Err. t-stat P>|t| Sig. -------------------------------------------------------------------------------------- --10 -0.3415 0.1566 -2.181 0.0292 * --9 0.3406 0.1067 3.191 0.0014 ** --8 -0.1465 0.1379 -1.062 0.2882 --7 0.1393 0.3326 0.419 0.6754 --6 0.2611 0.1646 1.586 0.1128 --5 -0.0466 0.1181 -0.394 0.6933 --4 0.1224 0.1344 0.911 0.3625 --3 0.0783 0.1576 0.497 0.6194 --2 0.1541 0.0957 1.610 0.1075 -0 0.4453 0.1627 2.737 0.0062 ** -1 0.7074 0.1903 3.716 0.0002 *** -2 0.7642 0.1612 4.739 0.0000 *** -3 0.5525 0.1646 3.357 0.0008 *** -4 0.2367 0.1909 1.240 0.2150 -5 0.6463 0.1313 4.921 0.0000 *** -------------------------------------------------------------------------------------- - -Signif. codes: '***' 0.001, '**' 0.01, '*' 0.05, '.' 
0.1 -===================================================================================== -``` -``` -### BusinessReport.to_dict() - headline + assumption -```json -{ - "effect": 0.5608256172839505, - "se": 0.12081241968043965, - "ci_lower": 0.3240376258251508, - "ci_upper": 0.7976136087427503, - "alpha_was_honored": true, - "alpha_override_caveat": null, - "ci_level": 95, - "p_value": 3.448543002483855e-06, - "is_significant": true, - "near_significance_threshold": false, - "unit": "per 100k population", - "unit_kind": "unknown", - "sign": "positive", - "breakdown_M": null -} -``` -```json -{ - "parallel_trends_variant": "conditional_or_group_time", - "no_anticipation": true, - "description": "Identification relies on parallel trends across treatment cohorts and time periods (group-time ATT), plus no anticipation." -} -``` - ---- diff --git a/docs/validation/validate_br_dr_canonical.py b/docs/validation/validate_br_dr_canonical.py deleted file mode 100644 index cc411551..00000000 --- a/docs/validation/validate_br_dr_canonical.py +++ /dev/null @@ -1,308 +0,0 @@ -"""Run BusinessReport / DiagnosticReport on canonical DiD datasets. - -Writes ``docs/validation/br_dr_canonical_validation.md`` with the -full BR ``summary()`` + ``full_report()`` + selected ``to_dict()`` -blocks for each dataset. The markdown output is the reviewable -artifact; compare it against canonical literature interpretations -and record any divergences in -``docs/validation/br_dr_canonical_findings.md``. - -Purpose: BR/DR gap #4 (real-dataset validation) — synthetic-DGP -tests pass but we haven't checked whether the prose output matches -canonical interpretations of applied work. - -Run via: ``python docs/validation/validate_br_dr_canonical.py``. 
-""" - -from __future__ import annotations - -import json -import sys -import warnings -from pathlib import Path - -import numpy as np - -from diff_diff import ( - BusinessReport, - CallawaySantAnna, - DifferenceInDifferences, - SunAbraham, -) -from diff_diff.datasets import ( - load_card_krueger, - load_castle_doctrine, - load_mpdta, -) - -OUT_PATH = Path(__file__).parent / "br_dr_canonical_validation.md" - - -def _section(title: str, level: int = 2) -> str: - return "#" * level + " " + title + "\n" - - -def _fence(body: str, lang: str = "") -> str: - return f"```{lang}\n{body.rstrip()}\n```\n" - - -def _dump_block(name: str, block: dict) -> str: - return _fence(json.dumps(block, indent=2, default=str), "json") - - -def _card_krueger_section() -> str: - """Card & Krueger (1994) minimum wage — classic 2x2 DiD. - - Canonical finding: no significant negative effect of NJ minimum-wage - increase on fast-food employment; published ATT ~ +2.8 FTE or - approximately 0.6 FTE per store depending on specification. CI - includes zero; direction positive. - """ - parts = [_section("Card & Krueger (1994): NJ/PA minimum wage", 2)] - ck = load_card_krueger() - # Reshape wide -> long per the docstring example. 
- ck_long = ck.melt( - id_vars=["store_id", "state", "treated"], - value_vars=["emp_pre", "emp_post"], - var_name="period", - value_name="employment", - ) - ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) - did = DifferenceInDifferences() - fit = did.fit(ck_long, outcome="employment", treatment="treated", time="post") - br = BusinessReport( - fit, - outcome_label="FTE employment", - outcome_unit="FTE", - outcome_direction="higher_is_better", - business_question="Did the NJ minimum-wage increase reduce fast-food employment?", - treatment_label="the NJ minimum-wage increase", - auto_diagnostics=False, # 2x2 PT needs manual column kwargs; run without for now - ) - parts.append( - "Data: NJ (treated, min wage $4.25 -> $5.05 on 1992-04-01) vs PA " - "(control, $4.25 throughout). Outcome: full-time equivalent employment. " - f"N={len(ck)} stores.\n\n" - ) - parts.append( - "Canonical interpretation: no significant disemployment effect of the " - "minimum-wage increase; published ATT ~ +0.59 FTE (positive direction). " - "The famous finding was that the CI included zero.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _mpdta_section() -> str: - """Callaway-Sant'Anna benchmark (mpdta): county-level log employment - under staggered minimum-wage increases. - - Canonical finding: CS aggregate ATT roughly -0.04 to -0.05 on log - employment (i.e., ~4-5% employment decline for treated counties). - Group-level ATT(g,t) shown in CS Figure 1. 
- """ - parts = [_section("Callaway-Sant'Anna benchmark (mpdta)", 2)] - df = load_mpdta() - cs = CallawaySantAnna(base_period="universal") - fit = cs.fit( - df, - outcome="lemp", - unit="countyreal", - time="year", - first_treat="first_treat", - aggregate="event_study", - ) - br = BusinessReport( - fit, - outcome_label="Log employment", - outcome_unit="log_points", - outcome_direction="higher_is_better", - business_question="Did minimum-wage increases reduce county employment?", - treatment_label="the state-level minimum wage increase", - data=df, - outcome="lemp", - unit="countyreal", - time="year", - first_treat="first_treat", - ) - parts.append( - "Data: simulated county-level panel from R `did` package (Callaway & " - "Sant'Anna 2021), 2003-2007, staggered minimum-wage increases. Outcome: " - "log employment (`lemp`).\n\n" - ) - parts.append( - "Canonical interpretation: CS aggregate ATT ~ -0.04 to -0.05 (log points) " - "on treated counties; group-specific ATT(g,t) negative across cohorts. " - "See CS (2021) Figures 1-2.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) - parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _castle_doctrine_section() -> str: - """Cheng & Hoekstra (2013): staggered adoption of Castle Doctrine laws. - - Canonical finding: ~8% increase in homicide rates in adopting - states; no deterrent effect on burglary or other crimes. 
- """ - parts = [_section("Cheng & Hoekstra (2013): Castle Doctrine laws", 2)] - df = load_castle_doctrine() - # CS with never-treated as control; outcome = homicide rate. - cs = CallawaySantAnna(base_period="universal", control_group="never_treated") - fit = cs.fit( - df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - aggregate="event_study", - ) - br = BusinessReport( - fit, - outcome_label="Homicide rate (per 100k)", - outcome_unit="per 100k population", - outcome_direction="lower_is_better", - business_question=( - "Did Castle Doctrine law adoption change state homicide rates?" - ), - treatment_label="Castle Doctrine law adoption", - data=df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - parts.append( - "Data: state-year panel, staggered Castle Doctrine law adoption 2005-2009. " - "Outcome: homicide rate per 100k population.\n\n" - ) - parts.append( - "Canonical interpretation: Cheng & Hoekstra (2013) found ~8% increase in " - "homicide rates in states that adopted Castle Doctrine (no deterrent " - "effect; if anything, an escalation).\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption + caveats", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append(_dump_block("pre_trends", d.get("pre_trends", {}))) - parts.append(_dump_block("sensitivity", d.get("sensitivity", {}))) - parts.append(_dump_block("caveats", d.get("caveats", []))) - parts.append("\n---\n") - return "".join(parts) - - -def _castle_doctrine_sun_abraham_section() -> str: - """Same Castle Doctrine dataset but run through Sun-Abraham, as a - cross-estimator consistency 
check. If SA and CS narrate the same - canonical finding differently, that's a BR/DR source-faithfulness - issue. - """ - parts = [_section("Castle Doctrine under Sun-Abraham (2021)", 2)] - df = load_castle_doctrine() - sa = SunAbraham() - fit = sa.fit( - df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - br = BusinessReport( - fit, - outcome_label="Homicide rate (per 100k)", - outcome_unit="per 100k population", - outcome_direction="lower_is_better", - business_question=( - "Did Castle Doctrine law adoption change state homicide rates?" - ), - treatment_label="Castle Doctrine law adoption", - data=df, - outcome="homicide_rate", - unit="state", - time="year", - first_treat="first_treat", - ) - parts.append( - "Same dataset and research question; different estimator. Testing BR/DR " - "cross-estimator narrative consistency.\n\n" - ) - parts.append(_section("BusinessReport.summary()", 3)) - parts.append(_fence(br.summary())) - parts.append(_section("BusinessReport.full_report()", 3)) - parts.append(_fence(br.full_report(), "markdown")) - parts.append(_section("BusinessReport.to_dict() - headline + assumption", 3)) - d = br.to_dict() - parts.append(_dump_block("headline", d.get("headline", {}))) - parts.append(_dump_block("assumption", d.get("assumption", {}))) - parts.append("\n---\n") - return "".join(parts) - - -def main() -> int: - warnings.filterwarnings("ignore") - np.random.seed(42) - - header = ( - "# BR / DR canonical-dataset validation\n\n" - "Output of ``docs/validation/validate_br_dr_canonical.py``. Each section " - "runs BusinessReport (and its auto-constructed DiagnosticReport) on a " - "canonical DiD dataset and dumps summary + full_report + selected " - "to_dict blocks. 
The purpose is to compare BR's prose output against " - "published canonical interpretations and record divergences in " - "``br_dr_canonical_findings.md``.\n\n" - "This file is regenerable; do not hand-edit.\n\n" - "Datasets covered: Card-Krueger (1994), mpdta (Callaway-Sant'Anna 2021 " - "benchmark), Castle Doctrine (Cheng-Hoekstra 2013, both CS and SA).\n\n" - "---\n\n" - ) - - sections = [header] - for name, fn in ( - ("card_krueger", _card_krueger_section), - ("mpdta", _mpdta_section), - ("castle_doctrine_cs", _castle_doctrine_section), - ("castle_doctrine_sa", _castle_doctrine_sun_abraham_section), - ): - print(f"Running {name} ...", file=sys.stderr) - try: - sections.append(fn()) - except Exception as exc: # noqa: BLE001 - sections.append( - _section(f"{name} (ERROR)", 2) - + _fence(f"{type(exc).__name__}: {exc}") - + "\n---\n" - ) - print(f" {type(exc).__name__}: {exc}", file=sys.stderr) - - OUT_PATH.write_text("".join(sections)) - print(f"Wrote {OUT_PATH}", file=sys.stderr) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py new file mode 100644 index 00000000..3fe7381c --- /dev/null +++ b/tests/test_br_dr_canonical_datasets.py @@ -0,0 +1,356 @@ +"""Canonical-dataset regression guards for BusinessReport / DiagnosticReport. + +Closes BR/DR foundation gap #4 (real-dataset validation): the risk was +that BR/DR's prose could silently diverge from canonical interpretations +of applied work without synthetic-DGP tests catching it. These tests +run BR on four canonical fits and assert direction / verdict / tier +properties that should hold regardless of small data-aggregation +differences between the bundled dataset and the published author +sample. + +Assertions are property-level, not exact-match: +- Sign of the point estimate. +- Whether the CI includes zero. +- Pre-trends verdict bin (``no_detected_violation`` vs + ``clear_violation``). 
+- HonestDiD sensitivity tier (robust vs fragile, via ``breakdown_M``). +- Cross-estimator consistency (CS and SA produce the same direction + and verdict on the same data). + +These tests use the ``_construct_*`` fallback data from +``diff_diff.datasets`` to avoid network dependency in CI. The +construction targets match the published summary statistics, so +canonical-direction / canonical-verdict properties hold. +""" + +from __future__ import annotations + +import warnings + +import pytest + +from diff_diff import ( + BusinessReport, + CallawaySantAnna, + DifferenceInDifferences, + SunAbraham, +) +from diff_diff.datasets import ( + _construct_card_krueger_data, + _construct_castle_doctrine_data, + _construct_mpdta_data, +) + + +@pytest.fixture(scope="module") +def card_krueger_long(): + """Card-Krueger dataset reshaped wide -> long for DiD fitting.""" + warnings.filterwarnings("ignore") + ck = _construct_card_krueger_data() + ck_long = ck.melt( + id_vars=["store_id", "state", "treated"], + value_vars=["emp_pre", "emp_post"], + var_name="period", + value_name="employment", + ) + ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) + return ck_long + + +@pytest.fixture(scope="module") +def mpdta_panel(): + """Callaway-Sant'Anna benchmark (mpdta) as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_mpdta_data() + + +@pytest.fixture(scope="module") +def castle_panel(): + """Cheng-Hoekstra Castle Doctrine dataset as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_castle_doctrine_data() + + +class TestCardKruegerCanonicalDirection: + """Card & Krueger (1994): NJ minimum-wage increase vs PA control. + + Canonical finding: no significant disemployment effect; published + ATT is positive (~+0.59 FTE per store) but the CI includes zero. 
+ """ + + def test_no_significant_disemployment(self, card_krueger_long): + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + outcome_unit="FTE", + treatment_label="the NJ minimum-wage increase", + outcome_direction="higher_is_better", + auto_diagnostics=False, + ) + h = br.to_dict()["headline"] + # Canonical: positive sign (no disemployment, if anything a + # small positive lift). + assert h["sign"] == "positive", ( + f"Card-Krueger canonical finding is a positive ATT; got " + f"sign={h['sign']!r}, effect={h['effect']!r}" + ) + # Canonical: CI includes zero -> not statistically significant. + assert h["ci_lower"] < 0 < h["ci_upper"], ( + f"Card-Krueger canonical finding is CI includes zero; got " + f"[{h['ci_lower']}, {h['ci_upper']}]" + ) + assert h["is_significant"] is False + # BR prose must name this in stakeholder-readable language. + summary = br.summary().lower() + assert "consistent with no effect" in summary, ( + f"BR summary must report 'consistent with no effect' on " + f"Card-Krueger. Got: {summary!r}" + ) + + def test_treatment_label_abbreviation_preserved(self, card_krueger_long): + """The ``NJ`` abbreviation in the treatment label must survive + BR's sentence capitalization (regression for the + ``str.capitalize()`` bug surfaced by this dataset). + """ + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + assert "The NJ minimum-wage increase" in br.headline() + + +class TestMpdtaCanonicalDirection: + """Callaway-Sant'Anna benchmark (mpdta): staggered minimum-wage + increases, log employment outcome. 
+ + Canonical finding: aggregate ATT is negative; the published fit is + robust under HonestDiD sensitivity; pre-trends do not reject + parallel trends. + """ + + def test_negative_att_robust_sensitivity_clean_pretrends(self, mpdta_panel): + cs = CallawaySantAnna(base_period="universal").fit( + mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Log employment", + outcome_unit="log_points", + treatment_label="the state-level minimum wage increase", + outcome_direction="higher_is_better", + data=mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: ATT on log employment is negative. + assert ( + h["sign"] == "negative" + ), f"mpdta canonical finding is negative ATT; got sign={h['sign']!r}" + # Canonical robustness: HonestDiD breakdown M > 1 means the + # result survives violations at least as large as the observed + # pre-period variation. + bkd = h.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd > 1.0, ( + f"mpdta canonical finding is robust sensitivity " + f"(breakdown_M > 1.0); got breakdown_M={bkd!r}" + ) + # Canonical pre-trends: do not reject PT. + pt = d["pre_trends"] + assert pt.get("verdict") == "no_detected_violation", ( + f"mpdta canonical finding is clean pre-trends " + f"(no_detected_violation); got verdict={pt.get('verdict')!r}" + ) + + +class TestCastleDoctrineCanonicalDirection: + """Cheng & Hoekstra (2013): Castle Doctrine / Stand Your Ground + laws staggered across U.S. states. + + Canonical finding: ~8% INCREASE in homicide rates (no deterrent + effect; if anything, escalation). Pre-trends violation is a + well-known issue with this dataset; HonestDiD sensitivity + flags the headline as fragile. 
+ """ + + def test_cs_positive_att_clear_violation_fragile_sensitivity(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: homicides went UP (positive ATT). + assert h["sign"] == "positive", ( + f"Castle Doctrine canonical finding is positive ATT (homicide " + f"escalation); got sign={h['sign']!r}" + ) + # Canonical: clear PT violation on this dataset. + pt = d["pre_trends"] + assert pt.get("verdict") == "clear_violation", ( + f"Castle Doctrine canonical finding is clear PT violation; " + f"got verdict={pt.get('verdict')!r}" + ) + # Canonical: HonestDiD flags fragility given the PT violation. + sens = d["sensitivity"] + assert sens.get("status") == "computed" + bkd = sens.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd < 0.5, ( + f"Castle Doctrine canonical finding is fragile sensitivity " + f"(breakdown_M < 0.5); got breakdown_M={bkd!r}" + ) + + def test_treatment_label_proper_noun_preserved(self, castle_panel): + """ "Castle Doctrine" must survive BR's sentence capitalization + (regression for the ``str.capitalize()`` bug surfaced by this + dataset). 
+ """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + auto_diagnostics=False, + ) + assert "Castle Doctrine law adoption" in br.headline() + + def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): + """Castle Doctrine's fragile sensitivity surfaced a + ``breakdown_M == 0`` edge case in BR's summary wording. The + summary must not quote ``0x the pre-period variation``; it + must use the smallest-grid-point phrasing. + """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + summary = br.summary() + bkd = br.to_dict()["headline"].get("breakdown_M") + # Sanity: this dataset actually produces the edge case. + assert isinstance(bkd, (int, float)) and bkd <= 0.05, ( + f"This test assumes Castle Doctrine + CS produces " + f"breakdown_M <= 0.05; if not, the dataset or estimator " + f"changed. Got breakdown_M={bkd!r}" + ) + # Must not render the degenerate "0x the pre-period variation" + # wording. + assert "0x" not in summary, ( + f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. 
Got: {summary!r}" + ) + assert "smallest parallel-trends violations" in summary + + +class TestCastleDoctrineCrossEstimatorConsistency: + """Running the same Castle Doctrine dataset through CS and SA must + produce consistent direction + PT verdict. SA is a natural + cross-check on the CS finding. + """ + + def test_sa_agrees_with_cs_on_direction_and_pt(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + sa = SunAbraham().fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_cs = BusinessReport( + cs, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_sa = BusinessReport( + sa, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + # Direction must agree: both positive (homicides up). + assert br_cs.to_dict()["headline"]["sign"] == br_sa.to_dict()["headline"]["sign"] + assert br_cs.to_dict()["headline"]["sign"] == "positive" + # PT verdict must agree on the clear-violation bin (both + # estimators read the same underlying pre-period coefficients). 
+ assert ( + br_cs.to_dict()["pre_trends"]["verdict"] + == br_sa.to_dict()["pre_trends"]["verdict"] + == "clear_violation" + ) From 6e24014c3077052c39f35adcef2e4961f79b2ec2 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 20:39:03 -0400 Subject: [PATCH 3/3] Address PR #341 R1: gate HonestDiD "smallest grid point" wording on evaluated grid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R1 caught a semantic bug in the round-1 canonical-validation wording fix. ``breakdown_M`` is the smallest M at which the robust CI includes zero — an interpolated threshold between grid points — not a claim about any specific grid point. Keying the "smallest grid point fails" wording off ``breakdown_M <= 0.05`` was wrong: on a grid starting at M=0 where the smallest evaluated point is still robust (CI excludes zero), a small ``breakdown_M=0.03`` means fragility emerges BETWEEN grid points, not at M=0. Fix (both BR and DR): - Added a ``_smallest_failing_grid_m`` helper (paired helpers in ``business_report.py`` and ``diagnostic_report.py``, intentionally duplicated with cross-reference comments per the parity rule from ``feedback_cross_surface_parity_audit.md``). - Helper returns the smallest evaluated M on the grid if that point has ``robust_to_zero == False``, else ``None``. - Fragile-sensitivity wording now fires "smallest M evaluated on the sensitivity grid (M = X)" ONLY when the helper returns a value; otherwise falls through to the numeric multiplier ``{bkd:.2g}x``. - Castle Doctrine (Cheng-Hoekstra 2013) CS fit: grid starts at M=0.5, every point non-robust — new wording quotes "(M = 0.5)" instead of "0x the pre-period variation". - Reviewer's counterexample (grid ``[0, 0.25, ...]`` with bkd=0.03, smallest point robust): wording falls through to "0.03x the pre-period variation", not "smallest grid point". Tests: - Rewrote ``TestCanonicalValidationSurfaceFixes`` on the BR side to build sensitivity schemas with explicit grids. 
Added paired cases: (a) smallest M fails, assert "smallest M evaluated"; (b) smallest M robust, breakdown 0.03, assert multiplier wording is used. - Added ``TestDRFragilePhrasingIsGridAware`` on the DR side mirroring the same paired cases against ``_render_overall_interpretation``. - Updated the Castle Doctrine canonical-dataset regression test to assert ``"M = 0.5"`` appears (actual smallest evaluated grid point). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 53 +++++++++--- diff_diff/diagnostic_report.py | 53 +++++++++--- tests/test_br_dr_canonical_datasets.py | 21 +++-- tests/test_business_report.py | 106 +++++++++++++++++------ tests/test_diagnostic_report.py | 115 +++++++++++++++++++++++++ 5 files changed, 289 insertions(+), 59 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index bbbeadc7..36f19780 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1854,6 +1854,30 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]: + """If the smallest evaluated M on the HonestDiD sensitivity grid + already has the robust CI including zero, return that M. Returns + ``None`` when the grid is missing or when the smallest evaluated + point is still robust — in the latter case ``breakdown_M`` is an + interpolated threshold between grid points, not a statement about + the smallest grid point itself. + + Matches the twin helper in ``diagnostic_report.py``; keep the two + in sync for cross-surface parity. 
+ """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + def _sentence_first_upper(text: str) -> str: """Uppercase only the first character of ``text``, preserving all other casing. Unlike ``str.capitalize()``, which lowercases every @@ -2115,19 +2139,26 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) elif isinstance(bkd, (int, float)): - # Round-1 BR/DR canonical-validation (2026-04-19): - # ``breakdown_M`` at or near zero reads as "0x the - # pre-period variation" which is a degenerate sentence - # (zero-times-anything is zero). The correct wording when - # the CI includes zero at the smallest grid point is to - # say the result is fragile to essentially any nonzero - # violation, not to quote the ``0x`` multiplier. - if bkd <= 0.05: + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: + # ``breakdown_M`` is the smallest M at which the robust + # CI includes zero (interpolated between grid points) — + # not a claim about any specific grid point. Earlier fix + # keyed off ``bkd <= 0.05`` which incorrectly asserted + # "smallest grid point fails" even for grids that start + # at M=0 where the smallest evaluated point is still + # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The + # "smallest grid point" wording is only accurate when + # the smallest evaluated M on the grid itself fails + # (``robust_to_zero == False``); otherwise fall through + # to the numeric multiplier. 
+ smallest_failed_m = _smallest_failing_grid_m(sens) + if smallest_failed_m is not None: sentences.append( "HonestDiD: the result is fragile — the confidence " - "interval includes zero even at the smallest " - "parallel-trends violations on the sensitivity " - "grid." + "interval includes zero even at the smallest M " + f"evaluated on the sensitivity grid (M = " + f"{smallest_failed_m:.2g})." ) else: sentences.append( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 21ce917d..6ec87b2a 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2780,6 +2780,32 @@ def _collect_pre_period_coefs( return results_list, n_dropped_undefined +def _smallest_failing_grid_m_dr(sens: Dict[str, Any]) -> Optional[float]: + """Return the smallest evaluated M on the HonestDiD sensitivity + grid if it already has the robust CI including zero, else ``None``. + Matches ``business_report._smallest_failing_grid_m`` — both helpers + must stay in sync for cross-surface parity. See PR #341 R1 review. + + ``breakdown_M`` is an interpolated threshold between grid points, + so "the smallest grid point fails" is only a valid claim when the + smallest actually-evaluated M has ``robust_to_zero == False``. On + a grid that starts at M=0 where the smallest evaluated point is + still robust, the breakdown value is information about what + happens between grid points — not at the smallest grid point. + """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + def _pt_verdict(p: Optional[float]) -> str: """Map a pre-trends joint p-value to the three-bin verdict enum. 
@@ -3118,22 +3144,25 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"pre-period variation." ) else: - # Round-1 BR/DR canonical-validation (2026-04-19): the - # "fragile — CI includes zero once violations reach 0x - # the pre-period variation" wording is a degenerate - # sentence at the ``breakdown_M == 0`` edge case - # surfaced by the Cheng-Hoekstra (2013) Castle Doctrine - # dataset. Mirror BR's fix: when the breakdown value is - # at or near zero, say the CI includes zero at the - # smallest grid point rather than quoting a ``0x`` - # multiplier. + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: the "smallest + # grid point" wording is only semantically correct when + # the smallest M actually evaluated on the sensitivity + # grid has ``robust_to_zero == False``. ``breakdown_M`` + # is the interpolated threshold between grid points, so + # a small breakdown value on a grid starting at M=0 + # (where the smallest evaluated point is still robust) + # would previously have been narrated as "smallest grid + # point fails" — stronger than the evaluated grid + # supports. Mirror BR's fix: check the grid directly. if isinstance(bkd, (int, float)): - if bkd <= 0.05: + smallest_failed_m = _smallest_failing_grid_m_dr(sens) + if smallest_failed_m is not None: sentences.append( "HonestDiD sensitivity: the result is fragile — " "the confidence interval includes zero even at " - "the smallest parallel-trends violations on the " - "sensitivity grid." + "the smallest M evaluated on the sensitivity " + f"grid (M = {smallest_failed_m:.2g})." 
) else: sentences.append( diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py index 3fe7381c..10998a61 100644 --- a/tests/test_br_dr_canonical_datasets.py +++ b/tests/test_br_dr_canonical_datasets.py @@ -259,11 +259,14 @@ def test_treatment_label_proper_noun_preserved(self, castle_panel): ) assert "Castle Doctrine law adoption" in br.headline() - def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): + def test_breakdown_m_zero_uses_smallest_m_evaluated_wording(self, castle_panel): """Castle Doctrine's fragile sensitivity surfaced a - ``breakdown_M == 0`` edge case in BR's summary wording. The - summary must not quote ``0x the pre-period variation``; it - must use the smallest-grid-point phrasing. + ``breakdown_M == 0`` edge case. The default HonestDiD grid + starts at M=0.5, and every grid point has the robust CI + including zero — so the smallest-M-evaluated wording is + semantically accurate here. BR's summary must say ``smallest + M evaluated on the sensitivity grid (M = 0.5)`` and must not + quote the degenerate ``0x`` multiplier. """ cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( castle_panel, @@ -292,12 +295,14 @@ def test_breakdown_m_zero_uses_smallest_grid_point_wording(self, castle_panel): f"breakdown_M <= 0.05; if not, the dataset or estimator " f"changed. Got breakdown_M={bkd!r}" ) - # Must not render the degenerate "0x the pre-period variation" - # wording. - assert "0x" not in summary, ( + # Must not render the degenerate ``0x`` multiplier. + assert "0x the pre-period variation" not in summary, ( f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. Got: {summary!r}" ) - assert "smallest parallel-trends violations" in summary + # Must name the smallest evaluated grid point (0.5 for the + # default grid). 
+ assert "smallest M evaluated on the sensitivity grid" in summary + assert "M = 0.5" in summary class TestCastleDoctrineCrossEstimatorConsistency: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 77e059e3..8ec51c2b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -4004,11 +4004,28 @@ class CallawaySantAnnaResults: stub.inference_method = "analytical" return stub - def _fragile_dr_schema(self, breakdown_m: float): + def _fragile_dr_schema(self, breakdown_m: float, grid=None): """Build a fake DiagnosticReportResults whose ``sensitivity`` - block carries the given ``breakdown_M`` value.""" + block carries the given ``breakdown_M`` value and grid. Pass + ``grid`` as a list of ``{"M": float, "robust_to_zero": bool}`` + dicts (other fields populated with plausible values). + """ from diff_diff.diagnostic_report import DiagnosticReportResults + grid = grid if grid is not None else [] + # Populate optional CI / bound fields so grid entries match + # the schema the BR/DR runners actually emit. + grid_full = [ + { + "M": row["M"], + "ci_lower": row.get("ci_lower", 0.0), + "ci_upper": row.get("ci_upper", 0.0), + "bound_lower": row.get("bound_lower", 0.0), + "bound_upper": row.get("bound_upper", 0.0), + "robust_to_zero": row["robust_to_zero"], + } + for row in grid + ] schema = { "schema_version": "1.0", "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, @@ -4020,7 +4037,7 @@ def _fragile_dr_schema(self, breakdown_m: float): "method": "relative_magnitude", "breakdown_M": breakdown_m, "conclusion": "fragile", - "grid": [], + "grid": grid_full, }, "placebo": {"status": "skipped", "reason": "stub"}, "bacon": {"status": "skipped", "reason": "stub"}, @@ -4097,50 +4114,83 @@ def test_treatment_label_preserves_proper_noun_case(self): "Castle Doctrine law adoption" in headline ), f"Proper-noun casing must be preserved. 
Got: {headline!r}" - def test_breakdown_m_zero_uses_smallest_grid_point_wording(self): - """Cheng-Hoekstra Castle Doctrine produces ``breakdown_M == 0`` - under HonestDiD. The old wording "violations reach 0x the - pre-period variation" reads as a degenerate zero-times-variation - sentence. The fix switches to "includes zero even at the - smallest parallel-trends violations on the sensitivity grid" - for breakdown values at or near zero. + def test_smallest_grid_m_fails_uses_smallest_grid_point_wording(self): + """When the smallest M actually evaluated on the grid has + ``robust_to_zero == False``, the "smallest M evaluated" wording + is semantically correct. This is the Cheng-Hoekstra Castle + Doctrine pattern: default grid ``[0.5, 1.0, 1.5, 2.0]`` with + M=0.5 already non-robust. """ stub = self._cs_like_stub_with_zero_breakdown() - dr = self._fragile_dr_schema(breakdown_m=0.0) + dr = self._fragile_dr_schema( + breakdown_m=0.0, + grid=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + {"M": 1.5, "robust_to_zero": False}, + {"M": 2.0, "robust_to_zero": False}, + ], + ) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() - assert "0x" not in summary, ( - f"Summary must not render ``0x the pre-period variation``; " - f"that reads as zero-times-anything. Got: {summary!r}" - ) - assert "smallest parallel-trends violations" in summary, ( - f"Summary must use the smallest-grid-point wording at " - f"breakdown_M == 0. Got: {summary!r}" + # Must not render the degenerate multiplier form on the + # zero-breakdown case. + assert ( + "0x the pre-period variation" not in summary + ), f"Summary must not quote ``0x`` multiplier. Got: {summary!r}" + # New wording quotes the actual smallest evaluated M. + assert "smallest M evaluated on the sensitivity grid" in summary, ( + f"Summary must use the smallest-M-evaluated wording when the " + f"smallest grid point actually fails. 
Got: {summary!r}" ) - - def test_breakdown_m_small_positive_still_uses_smallest_grid_point_wording(self): - """Breakdown values just above zero (e.g., 0.03) should also - route through the smallest-grid-point wording — quoting - ``0.03x`` to a stakeholder is equally uninformative. + assert "M = 0.5" in summary + + def test_smallest_grid_m_robust_falls_through_to_multiplier_wording(self): + """CI review on PR #341 R1: ``breakdown_M`` is the interpolated + threshold between grid points, not a claim about any specific + grid point. On a grid starting at M=0 where the smallest + evaluated point is still robust, a small ``breakdown_M=0.03`` + does NOT mean the smallest grid point failed — it means + fragility emerges between grid points. The correct wording is + the numeric multiplier, not the smallest-grid-point claim. """ stub = self._cs_like_stub_with_zero_breakdown() - dr = self._fragile_dr_schema(breakdown_m=0.03) + dr = self._fragile_dr_schema( + breakdown_m=0.03, + grid=[ + # Smallest evaluated M (0) is still robust: CI excludes + # zero. Breakdown is interpolated somewhere between M=0 + # and M=0.25. + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + {"M": 0.5, "robust_to_zero": False}, + ], + ) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() - assert "smallest parallel-trends violations" in summary - assert "0.03x" not in summary + # Must NOT claim the smallest grid point failed — it didn't. + assert "smallest M evaluated on the sensitivity grid" not in summary, ( + f"Summary must not assert ``smallest M evaluated fails`` when the " + f"smallest grid point is still robust. Got: {summary!r}" + ) + # Correct wording quotes the numeric multiplier. + assert "0.03x" in summary, ( + f"Fragile fit with robust smallest-M should quote the interpolated " + f"breakdown multiplier. 
Got: {summary!r}" + ) def test_breakdown_m_normal_keeps_multiplier_wording(self): """Breakdown values at the usual fragile-but-nonzero range (e.g., 0.3) must still quote the ``0.3x`` multiplier — the - smallest-grid-point wording is only for the degenerate tail. + smallest-M-evaluated wording is only for grids whose smallest + actually-evaluated point is already non-robust. """ stub = self._cs_like_stub_with_zero_breakdown() dr = self._fragile_dr_schema(breakdown_m=0.3) br = BusinessReport(stub, diagnostics=dr) summary = br.summary() assert "0.3x" in summary - assert "smallest parallel-trends violations" not in summary + assert "smallest M evaluated on the sensitivity grid" not in summary class TestBaconCaveatEstimatorAware: diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2f3800a6..ba473a56 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1891,6 +1891,121 @@ def test_full_report_has_headers(self, cs_fit): assert "## HonestDiD sensitivity" in md +class TestDRFragilePhrasingIsGridAware: + """CI review on PR #341 R1: DR's ``overall_interpretation`` + fragile-sensitivity sentence must be gated on the actual + evaluated grid, not just on ``breakdown_M``. ``breakdown_M`` is + the interpolated threshold between grid points; "smallest grid + point fails" is only a valid claim when the smallest actually- + evaluated M has ``robust_to_zero == False``. Mirrors the BR test + class ``TestCanonicalValidationSurfaceFixes``. 
+ """ + + @staticmethod + def _grid(breakdown_m, grid_rows): + """Build a sensitivity block with a populated grid, matching + the schema ``_check_sensitivity`` emits.""" + return { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": [ + { + "M": row["M"], + "ci_lower": 0.0, + "ci_upper": 0.0, + "bound_lower": 0.0, + "bound_upper": 0.0, + "robust_to_zero": row["robust_to_zero"], + } + for row in grid_rows + ], + } + + def _render(self, sens_block): + """Call the DR overall-interpretation renderer on a minimal + schema that has our sensitivity block and otherwise skipped + sections (so the fragile-sensitivity branch fires alone).""" + from diff_diff.diagnostic_report import _render_overall_interpretation + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": { + "status": "ran", + "effect": 0.5, + "se": 0.1, + "p_value": 0.0, + "ci_lower": 0.3, + "ci_upper": 0.7, + "is_significant": True, + "sign": "positive", + "alpha": 0.05, + }, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": sens_block, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "next_steps": [], + } + return _render_overall_interpretation(schema, {}) + + def test_dr_smallest_grid_m_fails_uses_smallest_m_wording(self): + """Castle Doctrine pattern: grid ``[0.5, 1.0, ...]`` with M=0.5 + already non-robust. DR emits "smallest M evaluated (M = 0.5)". 
+ """ + sens = self._grid( + breakdown_m=0.0, + grid_rows=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" in prose + assert "M = 0.5" in prose + assert "0x the pre-period variation" not in prose + + def test_dr_smallest_grid_m_robust_falls_through_to_multiplier(self): + """Grid starting at M=0 with smallest point still robust. + ``breakdown_M=0.03`` is the interpolated threshold between + M=0 and M=0.25; DR must NOT claim the smallest grid point + failed (it didn't) and must use the multiplier wording + instead. + """ + sens = self._grid( + breakdown_m=0.03, + grid_rows=[ + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" not in prose + assert "0.03x" in prose + + def test_dr_normal_fragile_keeps_multiplier(self): + """Normal fragile value (e.g., 0.3) still quotes the multiplier.""" + sens = self._grid( + breakdown_m=0.3, + grid_rows=[ + {"M": 0.5, "robust_to_zero": True}, + {"M": 1.0, "robust_to_zero": True}, + ], + ) + prose = self._render(sens) + assert "0.3x" in prose + assert "smallest M evaluated on the sensitivity grid" not in prose + + # --------------------------------------------------------------------------- # Public result class # ---------------------------------------------------------------------------