diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py
index 2445251f..36f19780 100644
--- a/diff_diff/business_report.py
+++ b/diff_diff/business_report.py
@@ -1854,6 +1854,48 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str:
     return "the confidence interval includes zero; the data are consistent with no effect"
 
 
+def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]:
+    """If the smallest evaluated M on the HonestDiD sensitivity grid
+    already has the robust CI including zero, return that M. Returns
+    ``None`` when the grid is missing or when the smallest evaluated
+    point is still robust — in the latter case ``breakdown_M`` is an
+    interpolated threshold between grid points, not a statement about
+    the smallest grid point itself.
+
+    Matches the twin helper in ``diagnostic_report.py``; keep the two
+    in sync for cross-surface parity.
+    """
+    grid_points = sens.get("grid") or []
+    sorted_grid = sorted(
+        (p for p in grid_points if isinstance(p.get("M"), (int, float))),
+        key=lambda p: p["M"],
+    )
+    if not sorted_grid:
+        return None
+    smallest = sorted_grid[0]
+    if not smallest.get("robust_to_zero", True):
+        return float(smallest["M"])
+    return None
+
+
+def _sentence_first_upper(text: str) -> str:
+    """Uppercase only the first character of ``text``, preserving all
+    other casing. Unlike ``str.capitalize()``, which lowercases every
+    character after the first, this keeps user-supplied abbreviations
+    and proper nouns intact.
+
+    Examples
+    --------
+    >>> _sentence_first_upper("the NJ minimum-wage increase")
+    'The NJ minimum-wage increase'
+    >>> _sentence_first_upper("Castle Doctrine law adoption")
+    'Castle Doctrine law adoption'
+    """
+    if not text:
+        return text
+    return text[0].upper() + text[1:]
+
+
 def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str:
     """Return a direction-aware verb for the headline sentence.
 
@@ -1929,7 +1971,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str:
         # is not actually available.
         ci_str = " (inference unavailable: confidence interval is undefined for this fit)"
     by_clause = f" by {magnitude}" if effect != 0 else ""
-    return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}."
+    # Round-1 BR/DR canonical-validation (2026-04-19): Python's
+    # ``str.capitalize()`` lowercases everything except the first
+    # character, so ``"the NJ minimum-wage increase".capitalize()``
+    # returns ``"The nj minimum-wage increase"`` — flattening the
+    # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger,
+    # Castle Doctrine) carry proper-noun / acronym tokens in the
+    # user-supplied ``treatment_label``, so preserve user casing and
+    # only ensure the first character is uppercase.
+    treatment_sentence = _sentence_first_upper(treatment)
+    return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}."
 
 
 def _render_summary(schema: Dict[str, Any]) -> str:
@@ -2088,11 +2139,33 @@ def _render_summary(schema: Dict[str, Any]) -> str:
                 f"pre-period variation."
             )
         elif isinstance(bkd, (int, float)):
-            sentences.append(
-                f"HonestDiD: the result is fragile — the confidence interval "
-                f"includes zero once violations reach {bkd:.2g}x the "
-                f"pre-period variation."
-            )
+            # Round-1 BR/DR canonical-validation (2026-04-19) then
+            # tightened per CI review on PR #341 R1:
+            # ``breakdown_M`` is the smallest M at which the robust
+            # CI includes zero (interpolated between grid points) —
+            # not a claim about any specific grid point. Earlier fix
+            # keyed off ``bkd <= 0.05`` which incorrectly asserted
+            # "smallest grid point fails" even for grids that start
+            # at M=0 where the smallest evaluated point is still
+            # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The
+            # "smallest grid point" wording is only accurate when
+            # the smallest evaluated M on the grid itself fails
+            # (``robust_to_zero == False``); otherwise fall through
+            # to the numeric multiplier.
+            smallest_failed_m = _smallest_failing_grid_m(sens)
+            if smallest_failed_m is not None:
+                sentences.append(
+                    "HonestDiD: the result is fragile — the confidence "
+                    "interval includes zero even at the smallest M "
+                    f"evaluated on the sensitivity grid (M = "
+                    f"{smallest_failed_m:.2g})."
+                )
+            else:
+                sentences.append(
+                    f"HonestDiD: the result is fragile — the confidence "
+                    f"interval includes zero once violations reach {bkd:.2g}x "
+                    f"the pre-period variation."
+                )
 
     # Sample sentence. For fits with a dynamic comparison set (CS /
     # ContinuousDiD / StaggeredTripleDiff / EfficientDiD /
diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py
index 0fe798a9..6ec87b2a 100644
--- a/diff_diff/diagnostic_report.py
+++ b/diff_diff/diagnostic_report.py
@@ -2780,6 +2780,32 @@ def _collect_pre_period_coefs(
     return results_list, n_dropped_undefined
 
 
+def _smallest_failing_grid_m_dr(sens: Dict[str, Any]) -> Optional[float]:
+    """Return the smallest evaluated M on the HonestDiD sensitivity
+    grid if it already has the robust CI including zero, else ``None``.
+    Matches ``business_report._smallest_failing_grid_m`` — both helpers
+    must stay in sync for cross-surface parity. See PR #341 R1 review.
+
+    ``breakdown_M`` is an interpolated threshold between grid points,
+    so "the smallest grid point fails" is only a valid claim when the
+    smallest actually-evaluated M has ``robust_to_zero == False``. On
+    a grid that starts at M=0 where the smallest evaluated point is
+    still robust, the breakdown value is information about what
+    happens between grid points — not at the smallest grid point.
+    """
+    grid_points = sens.get("grid") or []
+    sorted_grid = sorted(
+        (p for p in grid_points if isinstance(p.get("M"), (int, float))),
+        key=lambda p: p["M"],
+    )
+    if not sorted_grid:
+        return None
+    smallest = sorted_grid[0]
+    if not smallest.get("robust_to_zero", True):
+        return float(smallest["M"])
+    return None
+
+
 def _pt_verdict(p: Optional[float]) -> str:
     """Map a pre-trends joint p-value to the three-bin verdict enum.
 
@@ -3118,13 +3144,33 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str
                 f"pre-period variation."
             )
         else:
-            sentences.append(
-                f"HonestDiD sensitivity: the result is fragile — the "
-                f"confidence interval includes zero once violations reach "
-                f"{bkd:.2g}x the pre-period variation."
-                if isinstance(bkd, (int, float))
-                else ""
-            )
+            # Round-1 BR/DR canonical-validation (2026-04-19) then
+            # tightened per CI review on PR #341 R1: the "smallest
+            # grid point" wording is only semantically correct when
+            # the smallest M actually evaluated on the sensitivity
+            # grid has ``robust_to_zero == False``. ``breakdown_M``
+            # is the interpolated threshold between grid points, so
+            # a small breakdown value on a grid starting at M=0
+            # (where the smallest evaluated point is still robust)
+            # would previously have been narrated as "smallest grid
+            # point fails" — stronger than the evaluated grid
+            # supports. Mirror BR's fix: check the grid directly.
+            if isinstance(bkd, (int, float)):
+                smallest_failed_m = _smallest_failing_grid_m_dr(sens)
+                if smallest_failed_m is not None:
+                    sentences.append(
+                        "HonestDiD sensitivity: the result is fragile — "
+                        "the confidence interval includes zero even at "
+                        "the smallest M evaluated on the sensitivity "
+                        f"grid (M = {smallest_failed_m:.2g})."
+                    )
+                else:
+                    sentences.append(
+                        f"HonestDiD sensitivity: the result is fragile — "
+                        f"the confidence interval includes zero once "
+                        f"violations reach {bkd:.2g}x the pre-period "
+                        f"variation."
+                    )
 
     # Sentence 4: one secondary caveat if present.
     bacon = schema.get("bacon") or {}
diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py
new file mode 100644
index 00000000..10998a61
--- /dev/null
+++ b/tests/test_br_dr_canonical_datasets.py
@@ -0,0 +1,361 @@
+"""Canonical-dataset regression guards for BusinessReport / DiagnosticReport.
+
+Closes BR/DR foundation gap #4 (real-dataset validation): the risk was
+that BR/DR's prose could silently diverge from canonical interpretations
+of applied work without synthetic-DGP tests catching it. These tests
+run BR on four canonical fits and assert direction / verdict / tier
+properties that should hold regardless of small data-aggregation
+differences between the bundled dataset and the published author
+sample.
+
+Assertions are property-level, not exact-match:
+- Sign of the point estimate.
+- Whether the CI includes zero.
+- Pre-trends verdict bin (``no_detected_violation`` vs
+  ``clear_violation``).
+- HonestDiD sensitivity tier (robust vs fragile, via ``breakdown_M``).
+- Cross-estimator consistency (CS and SA produce the same direction
+  and verdict on the same data).
+
+These tests use the ``_construct_*`` fallback data from
+``diff_diff.datasets`` to avoid network dependency in CI. The
+construction targets match the published summary statistics, so
+canonical-direction / canonical-verdict properties hold.
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import pytest
+
+from diff_diff import (
+    BusinessReport,
+    CallawaySantAnna,
+    DifferenceInDifferences,
+    SunAbraham,
+)
+from diff_diff.datasets import (
+    _construct_card_krueger_data,
+    _construct_castle_doctrine_data,
+    _construct_mpdta_data,
+)
+
+
+@pytest.fixture(scope="module")
+def card_krueger_long():
+    """Card-Krueger dataset reshaped wide -> long for DiD fitting."""
+    warnings.filterwarnings("ignore")
+    ck = _construct_card_krueger_data()
+    ck_long = ck.melt(
+        id_vars=["store_id", "state", "treated"],
+        value_vars=["emp_pre", "emp_post"],
+        var_name="period",
+        value_name="employment",
+    )
+    ck_long["post"] = (ck_long["period"] == "emp_post").astype(int)
+    return ck_long
+
+
+@pytest.fixture(scope="module")
+def mpdta_panel():
+    """Callaway-Sant'Anna benchmark (mpdta) as constructed by the fallback."""
+    warnings.filterwarnings("ignore")
+    return _construct_mpdta_data()
+
+
+@pytest.fixture(scope="module")
+def castle_panel():
+    """Cheng-Hoekstra Castle Doctrine dataset as constructed by the fallback."""
+    warnings.filterwarnings("ignore")
+    return _construct_castle_doctrine_data()
+
+
+class TestCardKruegerCanonicalDirection:
+    """Card & Krueger (1994): NJ minimum-wage increase vs PA control.
+
+    Canonical finding: no significant disemployment effect; published
+    ATT is positive (~+0.59 FTE per store) but the CI includes zero.
+    """
+
+    def test_no_significant_disemployment(self, card_krueger_long):
+        did = DifferenceInDifferences().fit(
+            card_krueger_long,
+            outcome="employment",
+            treatment="treated",
+            time="post",
+        )
+        br = BusinessReport(
+            did,
+            outcome_label="FTE employment",
+            outcome_unit="FTE",
+            treatment_label="the NJ minimum-wage increase",
+            outcome_direction="higher_is_better",
+            auto_diagnostics=False,
+        )
+        h = br.to_dict()["headline"]
+        # Canonical: positive sign (no disemployment, if anything a
+        # small positive lift).
+        assert h["sign"] == "positive", (
+            f"Card-Krueger canonical finding is a positive ATT; got "
+            f"sign={h['sign']!r}, effect={h['effect']!r}"
+        )
+        # Canonical: CI includes zero -> not statistically significant.
+        assert h["ci_lower"] < 0 < h["ci_upper"], (
+            f"Card-Krueger canonical finding is CI includes zero; got "
+            f"[{h['ci_lower']}, {h['ci_upper']}]"
+        )
+        assert h["is_significant"] is False
+        # BR prose must name this in stakeholder-readable language.
+        summary = br.summary().lower()
+        assert "consistent with no effect" in summary, (
+            f"BR summary must report 'consistent with no effect' on "
+            f"Card-Krueger. Got: {summary!r}"
+        )
+
+    def test_treatment_label_abbreviation_preserved(self, card_krueger_long):
+        """The ``NJ`` abbreviation in the treatment label must survive
+        BR's sentence capitalization (regression for the
+        ``str.capitalize()`` bug surfaced by this dataset).
+        """
+        did = DifferenceInDifferences().fit(
+            card_krueger_long,
+            outcome="employment",
+            treatment="treated",
+            time="post",
+        )
+        br = BusinessReport(
+            did,
+            outcome_label="FTE employment",
+            treatment_label="the NJ minimum-wage increase",
+            auto_diagnostics=False,
+        )
+        assert "The NJ minimum-wage increase" in br.headline()
+
+
+class TestMpdtaCanonicalDirection:
+    """Callaway-Sant'Anna benchmark (mpdta): staggered minimum-wage
+    increases, log employment outcome.
+
+    Canonical finding: aggregate ATT is negative; the published fit is
+    robust under HonestDiD sensitivity; pre-trends do not reject
+    parallel trends.
+    """
+
+    def test_negative_att_robust_sensitivity_clean_pretrends(self, mpdta_panel):
+        cs = CallawaySantAnna(base_period="universal").fit(
+            mpdta_panel,
+            outcome="lemp",
+            unit="countyreal",
+            time="year",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        br = BusinessReport(
+            cs,
+            outcome_label="Log employment",
+            outcome_unit="log_points",
+            treatment_label="the state-level minimum wage increase",
+            outcome_direction="higher_is_better",
+            data=mpdta_panel,
+            outcome="lemp",
+            unit="countyreal",
+            time="year",
+            first_treat="first_treat",
+        )
+        d = br.to_dict()
+        h = d["headline"]
+        # Canonical direction: ATT on log employment is negative.
+        assert (
+            h["sign"] == "negative"
+        ), f"mpdta canonical finding is negative ATT; got sign={h['sign']!r}"
+        # Canonical robustness: HonestDiD breakdown M > 1 means the
+        # result survives violations at least as large as the observed
+        # pre-period variation.
+        bkd = h.get("breakdown_M")
+        assert isinstance(bkd, (int, float)) and bkd > 1.0, (
+            f"mpdta canonical finding is robust sensitivity "
+            f"(breakdown_M > 1.0); got breakdown_M={bkd!r}"
+        )
+        # Canonical pre-trends: do not reject PT.
+        pt = d["pre_trends"]
+        assert pt.get("verdict") == "no_detected_violation", (
+            f"mpdta canonical finding is clean pre-trends "
+            f"(no_detected_violation); got verdict={pt.get('verdict')!r}"
+        )
+
+
+class TestCastleDoctrineCanonicalDirection:
+    """Cheng & Hoekstra (2013): Castle Doctrine / Stand Your Ground
+    laws staggered across U.S. states.
+
+    Canonical finding: ~8% INCREASE in homicide rates (no deterrent
+    effect; if anything, escalation). Pre-trends violation is a
+    well-known issue with this dataset; HonestDiD sensitivity
+    flags the headline as fragile.
+    """
+
+    def test_cs_positive_att_clear_violation_fragile_sensitivity(self, castle_panel):
+        cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit(
+            castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        br = BusinessReport(
+            cs,
+            outcome_label="Homicide rate (per 100k)",
+            treatment_label="Castle Doctrine law adoption",
+            outcome_direction="lower_is_better",
+            data=castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+        )
+        d = br.to_dict()
+        h = d["headline"]
+        # Canonical direction: homicides went UP (positive ATT).
+        assert h["sign"] == "positive", (
+            f"Castle Doctrine canonical finding is positive ATT (homicide "
+            f"escalation); got sign={h['sign']!r}"
+        )
+        # Canonical: clear PT violation on this dataset.
+        pt = d["pre_trends"]
+        assert pt.get("verdict") == "clear_violation", (
+            f"Castle Doctrine canonical finding is clear PT violation; "
+            f"got verdict={pt.get('verdict')!r}"
+        )
+        # Canonical: HonestDiD flags fragility given the PT violation.
+        sens = d["sensitivity"]
+        assert sens.get("status") == "computed"
+        bkd = sens.get("breakdown_M")
+        assert isinstance(bkd, (int, float)) and bkd < 0.5, (
+            f"Castle Doctrine canonical finding is fragile sensitivity "
+            f"(breakdown_M < 0.5); got breakdown_M={bkd!r}"
+        )
+
+    def test_treatment_label_proper_noun_preserved(self, castle_panel):
+        """ "Castle Doctrine" must survive BR's sentence capitalization
+        (regression for the ``str.capitalize()`` bug surfaced by this
+        dataset).
+        """
+        cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit(
+            castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        br = BusinessReport(
+            cs,
+            outcome_label="Homicide rate (per 100k)",
+            treatment_label="Castle Doctrine law adoption",
+            outcome_direction="lower_is_better",
+            auto_diagnostics=False,
+        )
+        assert "Castle Doctrine law adoption" in br.headline()
+
+    def test_breakdown_m_zero_uses_smallest_m_evaluated_wording(self, castle_panel):
+        """Castle Doctrine's fragile sensitivity surfaced a
+        ``breakdown_M == 0`` edge case. The default HonestDiD grid
+        starts at M=0.5, and every grid point has the robust CI
+        including zero — so the smallest-M-evaluated wording is
+        semantically accurate here. BR's summary must say ``smallest
+        M evaluated on the sensitivity grid (M = 0.5)`` and must not
+        quote the degenerate ``0x`` multiplier.
+        """
+        cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit(
+            castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        br = BusinessReport(
+            cs,
+            outcome_label="Homicide rate (per 100k)",
+            treatment_label="Castle Doctrine law adoption",
+            outcome_direction="lower_is_better",
+            data=castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+        )
+        summary = br.summary()
+        bkd = br.to_dict()["headline"].get("breakdown_M")
+        # Sanity: this dataset actually produces the edge case.
+        assert isinstance(bkd, (int, float)) and bkd <= 0.05, (
+            f"This test assumes Castle Doctrine + CS produces "
+            f"breakdown_M <= 0.05; if not, the dataset or estimator "
+            f"changed. Got breakdown_M={bkd!r}"
+        )
+        # Must not render the degenerate ``0x`` multiplier.
+        assert "0x the pre-period variation" not in summary, (
+            f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. Got: {summary!r}"
+        )
+        # Must name the smallest evaluated grid point (0.5 for the
+        # default grid).
+        assert "smallest M evaluated on the sensitivity grid" in summary
+        assert "M = 0.5" in summary
+
+
+class TestCastleDoctrineCrossEstimatorConsistency:
+    """Running the same Castle Doctrine dataset through CS and SA must
+    produce consistent direction + PT verdict. SA is a natural
+    cross-check on the CS finding.
+    """
+
+    def test_sa_agrees_with_cs_on_direction_and_pt(self, castle_panel):
+        cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit(
+            castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+            aggregate="event_study",
+        )
+        sa = SunAbraham().fit(
+            castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+        )
+        br_cs = BusinessReport(
+            cs,
+            outcome_label="Homicide rate",
+            treatment_label="Castle Doctrine law adoption",
+            outcome_direction="lower_is_better",
+            data=castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+        )
+        br_sa = BusinessReport(
+            sa,
+            outcome_label="Homicide rate",
+            treatment_label="Castle Doctrine law adoption",
+            outcome_direction="lower_is_better",
+            data=castle_panel,
+            outcome="homicide_rate",
+            unit="state",
+            time="year",
+            first_treat="first_treat",
+        )
+        # Direction must agree: both positive (homicides up).
+        assert br_cs.to_dict()["headline"]["sign"] == br_sa.to_dict()["headline"]["sign"]
+        assert br_cs.to_dict()["headline"]["sign"] == "positive"
+        # PT verdict must agree on the clear-violation bin (both
+        # estimators read the same underlying pre-period coefficients).
+        assert (
+            br_cs.to_dict()["pre_trends"]["verdict"]
+            == br_sa.to_dict()["pre_trends"]["verdict"]
+            == "clear_violation"
+        )
diff --git a/tests/test_business_report.py b/tests/test_business_report.py
index cda3e5a7..8ec51c2b 100644
--- a/tests/test_business_report.py
+++ b/tests/test_business_report.py
@@ -3975,6 +3975,224 @@ def test_br_rejects_both_passthrough_inputs_names_them(self):
         assert "precomputed['sensitivity']" in msg
 
 
+class TestCanonicalValidationSurfaceFixes:
+    """Regression coverage for issues surfaced by the first round of
+    BR/DR canonical-dataset validation (``docs/validation/
+    validate_br_dr_canonical.py``). Each test pins a wording bug
+    observed on a real published-applied-work fit.
+    """
+
+    def _cs_like_stub_with_zero_breakdown(self):
+        """CS-style result stub matching the Cheng-Hoekstra Castle
+        Doctrine fit pattern."""
+
+        class CallawaySantAnnaResults:
+            pass
+
+        stub = CallawaySantAnnaResults()
+        stub.overall_att = 0.5608
+        stub.overall_se = 0.1216
+        stub.overall_p_value = 0.0
+        stub.overall_conf_int = (0.323, 0.799)
+        stub.alpha = 0.05
+        stub.n_obs = 539
+        stub.n_treated = 22
+        stub.n_control_units = 27
+        stub.survey_metadata = None
+        stub.event_study_effects = None
+        stub.base_period = "universal"
+        stub.inference_method = "analytical"
+        return stub
+
+    def _fragile_dr_schema(self, breakdown_m: float, grid=None):
+        """Build a fake DiagnosticReportResults whose ``sensitivity``
+        block carries the given ``breakdown_M`` value and grid. Pass
+        ``grid`` as a list of ``{"M": float, "robust_to_zero": bool}``
+        dicts (other fields populated with plausible values).
+        """
+        from diff_diff.diagnostic_report import DiagnosticReportResults
+
+        grid = grid if grid is not None else []
+        # Populate optional CI / bound fields so grid entries match
+        # the schema the BR/DR runners actually emit.
+        grid_full = [
+            {
+                "M": row["M"],
+                "ci_lower": row.get("ci_lower", 0.0),
+                "ci_upper": row.get("ci_upper", 0.0),
+                "bound_lower": row.get("bound_lower", 0.0),
+                "bound_upper": row.get("bound_upper", 0.0),
+                "robust_to_zero": row["robust_to_zero"],
+            }
+            for row in grid
+        ]
+        schema = {
+            "schema_version": "1.0",
+            "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"},
+            "headline_metric": {},
+            "parallel_trends": {"status": "skipped", "reason": "stub"},
+            "pretrends_power": {"status": "skipped", "reason": "stub"},
+            "sensitivity": {
+                "status": "ran",
+                "method": "relative_magnitude",
+                "breakdown_M": breakdown_m,
+                "conclusion": "fragile",
+                "grid": grid_full,
+            },
+            "placebo": {"status": "skipped", "reason": "stub"},
+            "bacon": {"status": "skipped", "reason": "stub"},
+            "design_effect": {"status": "skipped", "reason": "stub"},
+            "heterogeneity": {"status": "skipped", "reason": "stub"},
+            "epv": {"status": "skipped", "reason": "stub"},
+            "estimator_native_diagnostics": {"status": "not_applicable"},
+            "skipped": {},
+            "warnings": [],
+            "overall_interpretation": "",
+            "next_steps": [],
+        }
+        return DiagnosticReportResults(
+            schema=schema,
+            interpretation="",
+            applicable_checks=("sensitivity",),
+            skipped_checks={},
+            warnings=(),
+        )
+
+    def test_treatment_label_preserves_embedded_abbreviations(self):
+        """Card-Krueger use case: ``treatment_label="the NJ minimum-wage
+        increase"`` previously rendered as ``"The nj minimum-wage
+        increase"`` because ``str.capitalize()`` lowercases every
+        character after the first. The fix preserves user-supplied
+        casing and only uppercases the first character.
+        """
+
+        class DiDResults:
+            pass
+
+        stub = DiDResults()
+        stub.att = 1.47
+        stub.se = 1.93
+        stub.t_stat = 0.76
+        stub.p_value = 0.45
+        stub.conf_int = (-2.32, 5.27)
+        stub.alpha = 0.05
+        stub.n_obs = 620
+        stub.n_treated = 462
+        stub.n_control = 158
+        stub.survey_metadata = None
+        stub.inference_method = "analytical"
+        br = BusinessReport(
+            stub,
+            outcome_label="FTE employment",
+            treatment_label="the NJ minimum-wage increase",
+            auto_diagnostics=False,
+        )
+        headline = br.headline()
+        assert "The NJ minimum-wage increase" in headline, (
+            "Embedded ``NJ`` abbreviation must survive the first-word "
+            f"capitalization. Got headline: {headline!r}"
+        )
+        assert "The nj" not in headline, (
+            "Previous capitalize() bug lowercased the NJ abbreviation. " f"Got: {headline!r}"
+        )
+
+    def test_treatment_label_preserves_proper_noun_case(self):
+        """Castle Doctrine use case: ``treatment_label="Castle Doctrine
+        law adoption"`` previously rendered as ``"Castle doctrine law
+        adoption"`` because capitalize() lowercased the rest. Must
+        preserve proper-noun casing.
+        """
+        stub = self._cs_like_stub_with_zero_breakdown()
+        br = BusinessReport(
+            stub,
+            outcome_label="Homicide rate",
+            treatment_label="Castle Doctrine law adoption",
+            auto_diagnostics=False,
+        )
+        headline = br.headline()
+        assert (
+            "Castle Doctrine law adoption" in headline
+        ), f"Proper-noun casing must be preserved. Got: {headline!r}"
+
+    def test_smallest_grid_m_fails_uses_smallest_grid_point_wording(self):
+        """When the smallest M actually evaluated on the grid has
+        ``robust_to_zero == False``, the "smallest M evaluated" wording
+        is semantically correct. This is the Cheng-Hoekstra Castle
+        Doctrine pattern: default grid ``[0.5, 1.0, 1.5, 2.0]`` with
+        M=0.5 already non-robust.
+        """
+        stub = self._cs_like_stub_with_zero_breakdown()
+        dr = self._fragile_dr_schema(
+            breakdown_m=0.0,
+            grid=[
+                {"M": 0.5, "robust_to_zero": False},
+                {"M": 1.0, "robust_to_zero": False},
+                {"M": 1.5, "robust_to_zero": False},
+                {"M": 2.0, "robust_to_zero": False},
+            ],
+        )
+        br = BusinessReport(stub, diagnostics=dr)
+        summary = br.summary()
+        # Must not render the degenerate multiplier form on the
+        # zero-breakdown case.
+        assert (
+            "0x the pre-period variation" not in summary
+        ), f"Summary must not quote ``0x`` multiplier. Got: {summary!r}"
+        # New wording quotes the actual smallest evaluated M.
+        assert "smallest M evaluated on the sensitivity grid" in summary, (
+            f"Summary must use the smallest-M-evaluated wording when the "
+            f"smallest grid point actually fails. Got: {summary!r}"
+        )
+        assert "M = 0.5" in summary
+
+    def test_smallest_grid_m_robust_falls_through_to_multiplier_wording(self):
+        """CI review on PR #341 R1: ``breakdown_M`` is the interpolated
+        threshold between grid points, not a claim about any specific
+        grid point. On a grid starting at M=0 where the smallest
+        evaluated point is still robust, a small ``breakdown_M=0.03``
+        does NOT mean the smallest grid point failed — it means
+        fragility emerges between grid points. The correct wording is
+        the numeric multiplier, not the smallest-grid-point claim.
+        """
+        stub = self._cs_like_stub_with_zero_breakdown()
+        dr = self._fragile_dr_schema(
+            breakdown_m=0.03,
+            grid=[
+                # Smallest evaluated M (0) is still robust: CI excludes
+                # zero. Breakdown is interpolated somewhere between M=0
+                # and M=0.25.
+                {"M": 0.0, "robust_to_zero": True},
+                {"M": 0.25, "robust_to_zero": False},
+                {"M": 0.5, "robust_to_zero": False},
+            ],
+        )
+        br = BusinessReport(stub, diagnostics=dr)
+        summary = br.summary()
+        # Must NOT claim the smallest grid point failed — it didn't.
+        assert "smallest M evaluated on the sensitivity grid" not in summary, (
+            f"Summary must not assert ``smallest M evaluated fails`` when the "
+            f"smallest grid point is still robust. Got: {summary!r}"
+        )
+        # Correct wording quotes the numeric multiplier.
+        assert "0.03x" in summary, (
+            f"Fragile fit with robust smallest-M should quote the interpolated "
+            f"breakdown multiplier. Got: {summary!r}"
+        )
+
+    def test_breakdown_m_normal_keeps_multiplier_wording(self):
+        """Breakdown values at the usual fragile-but-nonzero range
+        (e.g., 0.3) must still quote the ``0.3x`` multiplier — the
+        smallest-M-evaluated wording is only for grids whose smallest
+        actually-evaluated point is already non-robust.
+        """
+        stub = self._cs_like_stub_with_zero_breakdown()
+        dr = self._fragile_dr_schema(breakdown_m=0.3)
+        br = BusinessReport(stub, diagnostics=dr)
+        summary = br.summary()
+        assert "0.3x" in summary
+        assert "smallest M evaluated on the sensitivity grid" not in summary
+
+
 class TestBaconCaveatEstimatorAware:
     """Round-45 P1 CI review on PR #318: Goodman-Bacon decomposes TWFE
     weights. On fits already produced by a heterogeneity-robust
diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py
index 2f3800a6..ba473a56 100644
--- a/tests/test_diagnostic_report.py
+++ b/tests/test_diagnostic_report.py
@@ -1891,6 +1891,121 @@ def test_full_report_has_headers(self, cs_fit):
         assert "## HonestDiD sensitivity" in md
 
 
+class TestDRFragilePhrasingIsGridAware:
+    """CI review on PR #341 R1: DR's ``overall_interpretation``
+    fragile-sensitivity sentence must be gated on the actual
+    evaluated grid, not just on ``breakdown_M``. ``breakdown_M`` is
+    the interpolated threshold between grid points; "smallest grid
+    point fails" is only a valid claim when the smallest actually-
+    evaluated M has ``robust_to_zero == False``. Mirrors the BR test
+    class ``TestCanonicalValidationSurfaceFixes``.
+    """
+
+    @staticmethod
+    def _grid(breakdown_m, grid_rows):
+        """Build a sensitivity block with a populated grid, matching
+        the schema ``_check_sensitivity`` emits."""
+        return {
+            "status": "ran",
+            "method": "relative_magnitude",
+            "breakdown_M": breakdown_m,
+            "conclusion": "fragile",
+            "grid": [
+                {
+                    "M": row["M"],
+                    "ci_lower": 0.0,
+                    "ci_upper": 0.0,
+                    "bound_lower": 0.0,
+                    "bound_upper": 0.0,
+                    "robust_to_zero": row["robust_to_zero"],
+                }
+                for row in grid_rows
+            ],
+        }
+
+    def _render(self, sens_block):
+        """Call the DR overall-interpretation renderer on a minimal
+        schema that has our sensitivity block and otherwise skipped
+        sections (so the fragile-sensitivity branch fires alone)."""
+        from diff_diff.diagnostic_report import _render_overall_interpretation
+
+        schema = {
+            "schema_version": "1.0",
+            "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"},
+            "headline_metric": {
+                "status": "ran",
+                "effect": 0.5,
+                "se": 0.1,
+                "p_value": 0.0,
+                "ci_lower": 0.3,
+                "ci_upper": 0.7,
+                "is_significant": True,
+                "sign": "positive",
+                "alpha": 0.05,
+            },
+            "parallel_trends": {"status": "skipped", "reason": "stub"},
+            "pretrends_power": {"status": "skipped", "reason": "stub"},
+            "sensitivity": sens_block,
+            "placebo": {"status": "skipped", "reason": "stub"},
+            "bacon": {"status": "skipped", "reason": "stub"},
+            "design_effect": {"status": "skipped", "reason": "stub"},
+            "heterogeneity": {"status": "skipped", "reason": "stub"},
+            "epv": {"status": "skipped", "reason": "stub"},
+            "estimator_native_diagnostics": {"status": "not_applicable"},
+            "skipped": {},
+            "warnings": [],
+            "next_steps": [],
+        }
+        return _render_overall_interpretation(schema, {})
+
+    def test_dr_smallest_grid_m_fails_uses_smallest_m_wording(self):
+        """Castle Doctrine pattern: grid ``[0.5, 1.0, ...]`` with M=0.5
+        already non-robust. DR emits "smallest M evaluated (M = 0.5)".
+        """
+        sens = self._grid(
+            breakdown_m=0.0,
+            grid_rows=[
+                {"M": 0.5, "robust_to_zero": False},
+                {"M": 1.0, "robust_to_zero": False},
+            ],
+        )
+        prose = self._render(sens)
+        assert "smallest M evaluated on the sensitivity grid" in prose
+        assert "M = 0.5" in prose
+        assert "0x the pre-period variation" not in prose
+
+    def test_dr_smallest_grid_m_robust_falls_through_to_multiplier(self):
+        """Grid starting at M=0 with smallest point still robust.
+        ``breakdown_M=0.03`` is the interpolated threshold between
+        M=0 and M=0.25; DR must NOT claim the smallest grid point
+        failed (it didn't) and must use the multiplier wording
+        instead.
+        """
+        sens = self._grid(
+            breakdown_m=0.03,
+            grid_rows=[
+                {"M": 0.0, "robust_to_zero": True},
+                {"M": 0.25, "robust_to_zero": False},
+            ],
+        )
+        prose = self._render(sens)
+        assert "smallest M evaluated on the sensitivity grid" not in prose
+        assert "0.03x" in prose
+
+    def test_dr_normal_fragile_keeps_multiplier(self):
+        """Normal fragile value (e.g., 0.3) still quotes the multiplier."""
+        sens = self._grid(
+            breakdown_m=0.3,
+            grid_rows=[
+                {"M": 0.5, "robust_to_zero": True},
+                {"M": 1.0, "robust_to_zero": True},
+            ],
+        )
+        prose = self._render(sens)
+        assert "0.3x" in prose
+        assert "smallest M evaluated on the sensitivity grid" not in prose
+
+
 # ---------------------------------------------------------------------------
 # Public result class
 # ---------------------------------------------------------------------------