diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 2445251f..36f19780 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1854,6 +1854,48 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _smallest_failing_grid_m(sens: Dict[str, Any]) -> Optional[float]: + """If the smallest evaluated M on the HonestDiD sensitivity grid + already has the robust CI including zero, return that M. Returns + ``None`` when the grid is missing or when the smallest evaluated + point is still robust — in the latter case ``breakdown_M`` is an + interpolated threshold between grid points, not a statement about + the smallest grid point itself. + + Matches the twin helper in ``diagnostic_report.py``; keep the two + in sync for cross-surface parity. + """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + +def _sentence_first_upper(text: str) -> str: + """Uppercase only the first character of ``text``, preserving all + other casing. Unlike ``str.capitalize()``, which lowercases every + character after the first, this keeps user-supplied abbreviations + and proper nouns intact. + + Examples + -------- + >>> _sentence_first_upper("the NJ minimum-wage increase") + 'The NJ minimum-wage increase' + >>> _sentence_first_upper("Castle Doctrine law adoption") + 'Castle Doctrine law adoption' + """ + if not text: + return text + return text[0].upper() + text[1:] + + def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str: """Return a direction-aware verb for the headline sentence. @@ -1929,7 +1971,16 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: # is not actually available. ci_str = " (inference unavailable: confidence interval is undefined for this fit)" by_clause = f" by {magnitude}" if effect != 0 else "" - return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}." + # Round-1 BR/DR canonical-validation (2026-04-19): Python's + # ``str.capitalize()`` lowercases everything except the first + # character, so ``"the NJ minimum-wage increase".capitalize()`` + # returns ``"The nj minimum-wage increase"`` — flattening the + # ``NJ`` abbreviation. Real canonical datasets (Card-Krueger, + # Castle Doctrine) carry proper-noun / acronym tokens in the + # user-supplied ``treatment_label``, so preserve user casing and + # only ensure the first character is uppercase. + treatment_sentence = _sentence_first_upper(treatment) + return f"{treatment_sentence} {verb} {outcome}{by_clause}{ci_str}." def _render_summary(schema: Dict[str, Any]) -> str: @@ -2088,11 +2139,33 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) elif isinstance(bkd, (int, float)): - sentences.append( - f"HonestDiD: the result is fragile — the confidence interval " - f"includes zero once violations reach {bkd:.2g}x the " - f"pre-period variation." - ) + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: + # ``breakdown_M`` is the smallest M at which the robust + # CI includes zero (interpolated between grid points) — + # not a claim about any specific grid point. Earlier fix + # keyed off ``bkd <= 0.05`` which incorrectly asserted + # "smallest grid point fails" even for grids that start + # at M=0 where the smallest evaluated point is still + # robust (e.g., grid=[0, 0.25, ...] with bkd=0.03). The + # "smallest grid point" wording is only accurate when + # the smallest evaluated M on the grid itself fails + # (``robust_to_zero == False``); otherwise fall through + # to the numeric multiplier. + smallest_failed_m = _smallest_failing_grid_m(sens) + if smallest_failed_m is not None: + sentences.append( + "HonestDiD: the result is fragile — the confidence " + "interval includes zero even at the smallest M " + f"evaluated on the sensitivity grid (M = " + f"{smallest_failed_m:.2g})." + ) + else: + sentences.append( + f"HonestDiD: the result is fragile — the confidence " + f"interval includes zero once violations reach {bkd:.2g}x " + f"the pre-period variation." + ) # Sample sentence. For fits with a dynamic comparison set (CS / # ContinuousDiD / StaggeredTripleDiff / EfficientDiD / diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 0fe798a9..6ec87b2a 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2780,6 +2780,32 @@ def _collect_pre_period_coefs( return results_list, n_dropped_undefined +def _smallest_failing_grid_m_dr(sens: Dict[str, Any]) -> Optional[float]: + """Return the smallest evaluated M on the HonestDiD sensitivity + grid if it already has the robust CI including zero, else ``None``. + Matches ``business_report._smallest_failing_grid_m`` — both helpers + must stay in sync for cross-surface parity. See PR #341 R1 review. + + ``breakdown_M`` is an interpolated threshold between grid points, + so "the smallest grid point fails" is only a valid claim when the + smallest actually-evaluated M has ``robust_to_zero == False``. On + a grid that starts at M=0 where the smallest evaluated point is + still robust, the breakdown value is information about what + happens between grid points — not at the smallest grid point. + """ + grid_points = sens.get("grid") or [] + sorted_grid = sorted( + (p for p in grid_points if isinstance(p.get("M"), (int, float))), + key=lambda p: p["M"], + ) + if not sorted_grid: + return None + smallest = sorted_grid[0] + if not smallest.get("robust_to_zero", True): + return float(smallest["M"]) + return None + + def _pt_verdict(p: Optional[float]) -> str: """Map a pre-trends joint p-value to the three-bin verdict enum. @@ -3118,13 +3144,33 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"pre-period variation." ) else: - sentences.append( - f"HonestDiD sensitivity: the result is fragile — the " - f"confidence interval includes zero once violations reach " - f"{bkd:.2g}x the pre-period variation." - if isinstance(bkd, (int, float)) - else "" - ) + # Round-1 BR/DR canonical-validation (2026-04-19) then + # tightened per CI review on PR #341 R1: the "smallest + # grid point" wording is only semantically correct when + # the smallest M actually evaluated on the sensitivity + # grid has ``robust_to_zero == False``. ``breakdown_M`` + # is the interpolated threshold between grid points, so + # a small breakdown value on a grid starting at M=0 + # (where the smallest evaluated point is still robust) + # would previously have been narrated as "smallest grid + # point fails" — stronger than the evaluated grid + # supports. Mirror BR's fix: check the grid directly. + if isinstance(bkd, (int, float)): + smallest_failed_m = _smallest_failing_grid_m_dr(sens) + if smallest_failed_m is not None: + sentences.append( + "HonestDiD sensitivity: the result is fragile — " + "the confidence interval includes zero even at " + "the smallest M evaluated on the sensitivity " + f"grid (M = {smallest_failed_m:.2g})." + ) + else: + sentences.append( + f"HonestDiD sensitivity: the result is fragile — " + f"the confidence interval includes zero once " + f"violations reach {bkd:.2g}x the pre-period " + f"variation." + ) # Sentence 4: one secondary caveat if present. bacon = schema.get("bacon") or {} diff --git a/tests/test_br_dr_canonical_datasets.py b/tests/test_br_dr_canonical_datasets.py new file mode 100644 index 00000000..10998a61 --- /dev/null +++ b/tests/test_br_dr_canonical_datasets.py @@ -0,0 +1,361 @@ +"""Canonical-dataset regression guards for BusinessReport / DiagnosticReport. + +Closes BR/DR foundation gap #4 (real-dataset validation): the risk was +that BR/DR's prose could silently diverge from canonical interpretations +of applied work without synthetic-DGP tests catching it. These tests +run BR on four canonical fits and assert direction / verdict / tier +properties that should hold regardless of small data-aggregation +differences between the bundled dataset and the published author +sample. + +Assertions are property-level, not exact-match: +- Sign of the point estimate. +- Whether the CI includes zero. +- Pre-trends verdict bin (``no_detected_violation`` vs + ``clear_violation``). +- HonestDiD sensitivity tier (robust vs fragile, via ``breakdown_M``). +- Cross-estimator consistency (CS and SA produce the same direction + and verdict on the same data). + +These tests use the ``_construct_*`` fallback data from +``diff_diff.datasets`` to avoid network dependency in CI. The +construction targets match the published summary statistics, so +canonical-direction / canonical-verdict properties hold. +""" + +from __future__ import annotations + +import warnings + +import pytest + +from diff_diff import ( + BusinessReport, + CallawaySantAnna, + DifferenceInDifferences, + SunAbraham, +) +from diff_diff.datasets import ( + _construct_card_krueger_data, + _construct_castle_doctrine_data, + _construct_mpdta_data, +) + + +@pytest.fixture(scope="module") +def card_krueger_long(): + """Card-Krueger dataset reshaped wide -> long for DiD fitting.""" + warnings.filterwarnings("ignore") + ck = _construct_card_krueger_data() + ck_long = ck.melt( + id_vars=["store_id", "state", "treated"], + value_vars=["emp_pre", "emp_post"], + var_name="period", + value_name="employment", + ) + ck_long["post"] = (ck_long["period"] == "emp_post").astype(int) + return ck_long + + +@pytest.fixture(scope="module") +def mpdta_panel(): + """Callaway-Sant'Anna benchmark (mpdta) as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_mpdta_data() + + +@pytest.fixture(scope="module") +def castle_panel(): + """Cheng-Hoekstra Castle Doctrine dataset as constructed by the fallback.""" + warnings.filterwarnings("ignore") + return _construct_castle_doctrine_data() + + +class TestCardKruegerCanonicalDirection: + """Card & Krueger (1994): NJ minimum-wage increase vs PA control. + + Canonical finding: no significant disemployment effect; published + ATT is positive (~+0.59 FTE per store) but the CI includes zero. + """ + + def test_no_significant_disemployment(self, card_krueger_long): + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + outcome_unit="FTE", + treatment_label="the NJ minimum-wage increase", + outcome_direction="higher_is_better", + auto_diagnostics=False, + ) + h = br.to_dict()["headline"] + # Canonical: positive sign (no disemployment, if anything a + # small positive lift). + assert h["sign"] == "positive", ( + f"Card-Krueger canonical finding is a positive ATT; got " + f"sign={h['sign']!r}, effect={h['effect']!r}" + ) + # Canonical: CI includes zero -> not statistically significant. + assert h["ci_lower"] < 0 < h["ci_upper"], ( + f"Card-Krueger canonical finding is CI includes zero; got " + f"[{h['ci_lower']}, {h['ci_upper']}]" + ) + assert h["is_significant"] is False + # BR prose must name this in stakeholder-readable language. + summary = br.summary().lower() + assert "consistent with no effect" in summary, ( + f"BR summary must report 'consistent with no effect' on " + f"Card-Krueger. Got: {summary!r}" + ) + + def test_treatment_label_abbreviation_preserved(self, card_krueger_long): + """The ``NJ`` abbreviation in the treatment label must survive + BR's sentence capitalization (regression for the + ``str.capitalize()`` bug surfaced by this dataset). + """ + did = DifferenceInDifferences().fit( + card_krueger_long, + outcome="employment", + treatment="treated", + time="post", + ) + br = BusinessReport( + did, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + assert "The NJ minimum-wage increase" in br.headline() + + +class TestMpdtaCanonicalDirection: + """Callaway-Sant'Anna benchmark (mpdta): staggered minimum-wage + increases, log employment outcome. + + Canonical finding: aggregate ATT is negative; the published fit is + robust under HonestDiD sensitivity; pre-trends do not reject + parallel trends. + """ + + def test_negative_att_robust_sensitivity_clean_pretrends(self, mpdta_panel): + cs = CallawaySantAnna(base_period="universal").fit( + mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Log employment", + outcome_unit="log_points", + treatment_label="the state-level minimum wage increase", + outcome_direction="higher_is_better", + data=mpdta_panel, + outcome="lemp", + unit="countyreal", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: ATT on log employment is negative. + assert ( + h["sign"] == "negative" + ), f"mpdta canonical finding is negative ATT; got sign={h['sign']!r}" + # Canonical robustness: HonestDiD breakdown M > 1 means the + # result survives violations at least as large as the observed + # pre-period variation. + bkd = h.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd > 1.0, ( + f"mpdta canonical finding is robust sensitivity " + f"(breakdown_M > 1.0); got breakdown_M={bkd!r}" + ) + # Canonical pre-trends: do not reject PT. + pt = d["pre_trends"] + assert pt.get("verdict") == "no_detected_violation", ( + f"mpdta canonical finding is clean pre-trends " + f"(no_detected_violation); got verdict={pt.get('verdict')!r}" + ) + + +class TestCastleDoctrineCanonicalDirection: + """Cheng & Hoekstra (2013): Castle Doctrine / Stand Your Ground + laws staggered across U.S. states. + + Canonical finding: ~8% INCREASE in homicide rates (no deterrent + effect; if anything, escalation). Pre-trends violation is a + well-known issue with this dataset; HonestDiD sensitivity + flags the headline as fragile. + """ + + def test_cs_positive_att_clear_violation_fragile_sensitivity(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + d = br.to_dict() + h = d["headline"] + # Canonical direction: homicides went UP (positive ATT). + assert h["sign"] == "positive", ( + f"Castle Doctrine canonical finding is positive ATT (homicide " + f"escalation); got sign={h['sign']!r}" + ) + # Canonical: clear PT violation on this dataset. + pt = d["pre_trends"] + assert pt.get("verdict") == "clear_violation", ( + f"Castle Doctrine canonical finding is clear PT violation; " + f"got verdict={pt.get('verdict')!r}" + ) + # Canonical: HonestDiD flags fragility given the PT violation. + sens = d["sensitivity"] + assert sens.get("status") == "computed" + bkd = sens.get("breakdown_M") + assert isinstance(bkd, (int, float)) and bkd < 0.5, ( + f"Castle Doctrine canonical finding is fragile sensitivity " + f"(breakdown_M < 0.5); got breakdown_M={bkd!r}" + ) + + def test_treatment_label_proper_noun_preserved(self, castle_panel): + """ "Castle Doctrine" must survive BR's sentence capitalization + (regression for the ``str.capitalize()`` bug surfaced by this + dataset). + """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + auto_diagnostics=False, + ) + assert "Castle Doctrine law adoption" in br.headline() + + def test_breakdown_m_zero_uses_smallest_m_evaluated_wording(self, castle_panel): + """Castle Doctrine's fragile sensitivity surfaced a + ``breakdown_M == 0`` edge case. The default HonestDiD grid + starts at M=0.5, and every grid point has the robust CI + including zero — so the smallest-M-evaluated wording is + semantically accurate here. BR's summary must say ``smallest + M evaluated on the sensitivity grid (M = 0.5)`` and must not + quote the degenerate ``0x`` multiplier. + """ + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport( + cs, + outcome_label="Homicide rate (per 100k)", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + summary = br.summary() + bkd = br.to_dict()["headline"].get("breakdown_M") + # Sanity: this dataset actually produces the edge case. + assert isinstance(bkd, (int, float)) and bkd <= 0.05, ( + f"This test assumes Castle Doctrine + CS produces " + f"breakdown_M <= 0.05; if not, the dataset or estimator " + f"changed. Got breakdown_M={bkd!r}" + ) + # Must not render the degenerate ``0x`` multiplier. + assert "0x the pre-period variation" not in summary, ( + f"Summary must not quote ``0x`` multiplier on edge-case " f"breakdown. Got: {summary!r}" + ) + # Must name the smallest evaluated grid point (0.5 for the + # default grid). + assert "smallest M evaluated on the sensitivity grid" in summary + assert "M = 0.5" in summary + + +class TestCastleDoctrineCrossEstimatorConsistency: + """Running the same Castle Doctrine dataset through CS and SA must + produce consistent direction + PT verdict. SA is a natural + cross-check on the CS finding. + """ + + def test_sa_agrees_with_cs_on_direction_and_pt(self, castle_panel): + cs = CallawaySantAnna(base_period="universal", control_group="never_treated").fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + aggregate="event_study", + ) + sa = SunAbraham().fit( + castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_cs = BusinessReport( + cs, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + br_sa = BusinessReport( + sa, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + outcome_direction="lower_is_better", + data=castle_panel, + outcome="homicide_rate", + unit="state", + time="year", + first_treat="first_treat", + ) + # Direction must agree: both positive (homicides up). + assert br_cs.to_dict()["headline"]["sign"] == br_sa.to_dict()["headline"]["sign"] + assert br_cs.to_dict()["headline"]["sign"] == "positive" + # PT verdict must agree on the clear-violation bin (both + # estimators read the same underlying pre-period coefficients). + assert ( + br_cs.to_dict()["pre_trends"]["verdict"] + == br_sa.to_dict()["pre_trends"]["verdict"] + == "clear_violation" + ) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index cda3e5a7..8ec51c2b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3975,6 +3975,224 @@ def test_br_rejects_both_passthrough_inputs_names_them(self): assert "precomputed['sensitivity']" in msg +class TestCanonicalValidationSurfaceFixes: + """Regression coverage for issues surfaced by the first round of + BR/DR canonical-dataset validation (``docs/validation/ + validate_br_dr_canonical.py``). Each test pins a wording bug + observed on a real published-applied-work fit. + """ + + def _cs_like_stub_with_zero_breakdown(self): + """CS-style result stub matching the Cheng-Hoekstra Castle + Doctrine fit pattern.""" + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 0.5608 + stub.overall_se = 0.1216 + stub.overall_p_value = 0.0 + stub.overall_conf_int = (0.323, 0.799) + stub.alpha = 0.05 + stub.n_obs = 539 + stub.n_treated = 22 + stub.n_control_units = 27 + stub.survey_metadata = None + stub.event_study_effects = None + stub.base_period = "universal" + stub.inference_method = "analytical" + return stub + + def _fragile_dr_schema(self, breakdown_m: float, grid=None): + """Build a fake DiagnosticReportResults whose ``sensitivity`` + block carries the given ``breakdown_M`` value and grid. Pass + ``grid`` as a list of ``{"M": float, "robust_to_zero": bool}`` + dicts (other fields populated with plausible values). + """ + from diff_diff.diagnostic_report import DiagnosticReportResults + + grid = grid if grid is not None else [] + # Populate optional CI / bound fields so grid entries match + # the schema the BR/DR runners actually emit. + grid_full = [ + { + "M": row["M"], + "ci_lower": row.get("ci_lower", 0.0), + "ci_upper": row.get("ci_upper", 0.0), + "bound_lower": row.get("bound_lower", 0.0), + "bound_upper": row.get("bound_upper", 0.0), + "robust_to_zero": row["robust_to_zero"], + } + for row in grid + ] + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": {}, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": grid_full, + }, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + return DiagnosticReportResults( + schema=schema, + interpretation="", + applicable_checks=("sensitivity",), + skipped_checks={}, + warnings=(), + ) + + def test_treatment_label_preserves_embedded_abbreviations(self): + """Card-Krueger use case: ``treatment_label="the NJ minimum-wage + increase"`` previously rendered as ``"The nj minimum-wage + increase"`` because ``str.capitalize()`` lowercases every + character after the first. The fix preserves user-supplied + casing and only uppercases the first character. + """ + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.47 + stub.se = 1.93 + stub.t_stat = 0.76 + stub.p_value = 0.45 + stub.conf_int = (-2.32, 5.27) + stub.alpha = 0.05 + stub.n_obs = 620 + stub.n_treated = 462 + stub.n_control = 158 + stub.survey_metadata = None + stub.inference_method = "analytical" + br = BusinessReport( + stub, + outcome_label="FTE employment", + treatment_label="the NJ minimum-wage increase", + auto_diagnostics=False, + ) + headline = br.headline() + assert "The NJ minimum-wage increase" in headline, ( + "Embedded ``NJ`` abbreviation must survive the first-word " + f"capitalization. Got headline: {headline!r}" + ) + assert "The nj" not in headline, ( + "Previous capitalize() bug lowercased the NJ abbreviation. " f"Got: {headline!r}" + ) + + def test_treatment_label_preserves_proper_noun_case(self): + """Castle Doctrine use case: ``treatment_label="Castle Doctrine + law adoption"`` previously rendered as ``"Castle doctrine law + adoption"`` because capitalize() lowercased the rest. Must + preserve proper-noun casing. + """ + stub = self._cs_like_stub_with_zero_breakdown() + br = BusinessReport( + stub, + outcome_label="Homicide rate", + treatment_label="Castle Doctrine law adoption", + auto_diagnostics=False, + ) + headline = br.headline() + assert ( + "Castle Doctrine law adoption" in headline + ), f"Proper-noun casing must be preserved. Got: {headline!r}" + + def test_smallest_grid_m_fails_uses_smallest_grid_point_wording(self): + """When the smallest M actually evaluated on the grid has + ``robust_to_zero == False``, the "smallest M evaluated" wording + is semantically correct. This is the Cheng-Hoekstra Castle + Doctrine pattern: default grid ``[0.5, 1.0, 1.5, 2.0]`` with + M=0.5 already non-robust. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema( + breakdown_m=0.0, + grid=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + {"M": 1.5, "robust_to_zero": False}, + {"M": 2.0, "robust_to_zero": False}, + ], + ) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + # Must not render the degenerate multiplier form on the + # zero-breakdown case. + assert ( + "0x the pre-period variation" not in summary + ), f"Summary must not quote ``0x`` multiplier. Got: {summary!r}" + # New wording quotes the actual smallest evaluated M. + assert "smallest M evaluated on the sensitivity grid" in summary, ( + f"Summary must use the smallest-M-evaluated wording when the " + f"smallest grid point actually fails. Got: {summary!r}" + ) + assert "M = 0.5" in summary + + def test_smallest_grid_m_robust_falls_through_to_multiplier_wording(self): + """CI review on PR #341 R1: ``breakdown_M`` is the interpolated + threshold between grid points, not a claim about any specific + grid point. On a grid starting at M=0 where the smallest + evaluated point is still robust, a small ``breakdown_M=0.03`` + does NOT mean the smallest grid point failed — it means + fragility emerges between grid points. The correct wording is + the numeric multiplier, not the smallest-grid-point claim. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema( + breakdown_m=0.03, + grid=[ + # Smallest evaluated M (0) is still robust: CI excludes + # zero. Breakdown is interpolated somewhere between M=0 + # and M=0.25. + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + {"M": 0.5, "robust_to_zero": False}, + ], + ) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + # Must NOT claim the smallest grid point failed — it didn't. + assert "smallest M evaluated on the sensitivity grid" not in summary, ( + f"Summary must not assert ``smallest M evaluated fails`` when the " + f"smallest grid point is still robust. Got: {summary!r}" + ) + # Correct wording quotes the numeric multiplier. + assert "0.03x" in summary, ( + f"Fragile fit with robust smallest-M should quote the interpolated " + f"breakdown multiplier. Got: {summary!r}" + ) + + def test_breakdown_m_normal_keeps_multiplier_wording(self): + """Breakdown values at the usual fragile-but-nonzero range + (e.g., 0.3) must still quote the ``0.3x`` multiplier — the + smallest-M-evaluated wording is only for grids whose smallest + actually-evaluated point is already non-robust. + """ + stub = self._cs_like_stub_with_zero_breakdown() + dr = self._fragile_dr_schema(breakdown_m=0.3) + br = BusinessReport(stub, diagnostics=dr) + summary = br.summary() + assert "0.3x" in summary + assert "smallest M evaluated on the sensitivity grid" not in summary + + class TestBaconCaveatEstimatorAware: """Round-45 P1 CI review on PR #318: Goodman-Bacon decomposes TWFE weights. On fits already produced by a heterogeneity-robust diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2f3800a6..ba473a56 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1891,6 +1891,121 @@ def test_full_report_has_headers(self, cs_fit): assert "## HonestDiD sensitivity" in md +class TestDRFragilePhrasingIsGridAware: + """CI review on PR #341 R1: DR's ``overall_interpretation`` + fragile-sensitivity sentence must be gated on the actual + evaluated grid, not just on ``breakdown_M``. ``breakdown_M`` is + the interpolated threshold between grid points; "smallest grid + point fails" is only a valid claim when the smallest actually- + evaluated M has ``robust_to_zero == False``. Mirrors the BR test + class ``TestCanonicalValidationSurfaceFixes``. + """ + + @staticmethod + def _grid(breakdown_m, grid_rows): + """Build a sensitivity block with a populated grid, matching + the schema ``_check_sensitivity`` emits.""" + return { + "status": "ran", + "method": "relative_magnitude", + "breakdown_M": breakdown_m, + "conclusion": "fragile", + "grid": [ + { + "M": row["M"], + "ci_lower": 0.0, + "ci_upper": 0.0, + "bound_lower": 0.0, + "bound_upper": 0.0, + "robust_to_zero": row["robust_to_zero"], + } + for row in grid_rows + ], + } + + def _render(self, sens_block): + """Call the DR overall-interpretation renderer on a minimal + schema that has our sensitivity block and otherwise skipped + sections (so the fragile-sensitivity branch fires alone).""" + from diff_diff.diagnostic_report import _render_overall_interpretation + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, + "headline_metric": { + "status": "ran", + "effect": 0.5, + "se": 0.1, + "p_value": 0.0, + "ci_lower": 0.3, + "ci_upper": 0.7, + "is_significant": True, + "sign": "positive", + "alpha": 0.05, + }, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": sens_block, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": {"status": "skipped", "reason": "stub"}, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "next_steps": [], + } + return _render_overall_interpretation(schema, {}) + + def test_dr_smallest_grid_m_fails_uses_smallest_m_wording(self): + """Castle Doctrine pattern: grid ``[0.5, 1.0, ...]`` with M=0.5 + already non-robust. DR emits "smallest M evaluated (M = 0.5)". + """ + sens = self._grid( + breakdown_m=0.0, + grid_rows=[ + {"M": 0.5, "robust_to_zero": False}, + {"M": 1.0, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" in prose + assert "M = 0.5" in prose + assert "0x the pre-period variation" not in prose + + def test_dr_smallest_grid_m_robust_falls_through_to_multiplier(self): + """Grid starting at M=0 with smallest point still robust. + ``breakdown_M=0.03`` is the interpolated threshold between + M=0 and M=0.25; DR must NOT claim the smallest grid point + failed (it didn't) and must use the multiplier wording + instead. + """ + sens = self._grid( + breakdown_m=0.03, + grid_rows=[ + {"M": 0.0, "robust_to_zero": True}, + {"M": 0.25, "robust_to_zero": False}, + ], + ) + prose = self._render(sens) + assert "smallest M evaluated on the sensitivity grid" not in prose + assert "0.03x" in prose + + def test_dr_normal_fragile_keeps_multiplier(self): + """Normal fragile value (e.g., 0.3) still quotes the multiplier.""" + sens = self._grid( + breakdown_m=0.3, + grid_rows=[ + {"M": 0.5, "robust_to_zero": True}, + {"M": 1.0, "robust_to_zero": True}, + ], + ) + prose = self._render(sens) + assert "0.3x" in prose + assert "smallest M evaluated on the sensitivity grid" not in prose + + # --------------------------------------------------------------------------- # Public result class # ---------------------------------------------------------------------------