diff --git a/TODO.md b/TODO.md index ab8ec9bc..0ec19d0d 100644 --- a/TODO.md +++ b/TODO.md @@ -95,7 +95,11 @@ Deferred items from PR reviews that were not addressed before merge. | `HeterogeneousAdoptionDiD`: `weights=` support. Deferred jointly with survey integration. nprobust's `lprobust` has no weight argument so the nonparametric continuous path needs a derivation; the 2SLS mass-point path needs weighted-sandwich parity. | `diff_diff/had.py` | Phase 2a | Medium | | `HeterogeneousAdoptionDiD` mass-point: `vcov_type in {"hc2", "hc2_bm"}` raises `NotImplementedError` pending a 2SLS-specific leverage derivation. The OLS leverage `x_i' (X'X)^{-1} x_i` is wrong for 2SLS; the correct finite-sample correction uses `x_i' (Z'X)^{-1} (...) (X'Z)^{-1} x_i`. Needs derivation plus an R / Stata (`ivreg2 small robust`) parity anchor. | `diff_diff/had.py::_fit_mass_point_2sls` | Phase 2a | Medium | | `HeterogeneousAdoptionDiD` continuous paths: thread `cluster=` through `bias_corrected_local_linear` (Phase 1c's wrapper already supports cluster; Phase 2a ignores it with a `UserWarning` on the continuous path to keep scope tight). | `diff_diff/had.py`, `diff_diff/local_linear.py` | Phase 2a | Low | -| `HeterogeneousAdoptionDiD` Phase 3: `qug_test()`, `stute_test()`, `yatchew_hr_test()` pre-test diagnostics (paper Section 3.3). Composite helper `did_had_pretest_workflow()`. Not part of Phase 2a scope. | `diff_diff/had.py`, new module | Phase 2a | Medium | +| `HeterogeneousAdoptionDiD` Phase 3 joint Equation 18 cross-horizon Stute test: paper's step 2 of the four-step pre-testing workflow tests joint pre-trends via a stacked-residual CvM across pre-period placebos. Phase 3 shipped the single-horizon Stute in `did_had_pretest_workflow()`; the joint variant needs the exact stacked-residual formula extracted from the paper PDF (not reproduced in `dechaisemartin-2026-review.md`). Follow-up patch. 
| `diff_diff/had_pretests.py` | Phase 3 | Medium | +| `HeterogeneousAdoptionDiD` Phase 3 Stute performance: Appendix D vectorized matrix form replaces the per-iteration OLS refit with a single precomputed `M = I - X(X'X)^{-1}X'` applied to `eps * eta`. Functionally identical, ~2x faster. Shipped literal-refit form in Phase 3 to match paper text and keep reviewer surface small. | `diff_diff/had_pretests.py::stute_test` | Phase 3 | Low | +| `HeterogeneousAdoptionDiD` Phase 3 R-parity: Phase 3 ships coverage-rate validation on synthetic DGPs (not tight point parity against `chaisemartin::stute_test` / `yatchew_test`). Tight numerical parity requires aligning bootstrap seed semantics and `B` across numpy/R and is deferred. | `tests/test_had_pretests.py` | Phase 3 | Low | +| `HeterogeneousAdoptionDiD` Phase 3 nprobust bandwidth for Stute: some Stute variants on continuous regressors use nprobust-style optimal bandwidth selection. Phase 3 uses OLS residuals from a 2-parameter linear fit (no bandwidth selection). nprobust integration is a future enhancement; not in paper scope. | `diff_diff/had_pretests.py::stute_test` | Phase 3 | Low | +| `HeterogeneousAdoptionDiD` Phase 3 multi-period workflow dispatch: `did_had_pretest_workflow` accepts two-period overall-path panels only. Multi-period users pre-slice to `(F-1, F)` before calling. A follow-up could add `aggregate="event_study"`-like dispatch for joint pre-trend diagnostics alongside Equation 18. | `diff_diff/had_pretests.py::did_had_pretest_workflow` | Phase 3 | Low | | `HeterogeneousAdoptionDiD` Phase 4: Pierce-Schott (2016) replication harness; reproduce paper Figure 2 values and Table 1 coverage rates. | `benchmarks/`, `tests/` | Phase 2a | Low | | `HeterogeneousAdoptionDiD` Phase 5: `practitioner_next_steps()` integration, tutorial notebook, and `llms.txt` updates (preserving UTF-8 fingerprint). 
| `diff_diff/practitioner.py`, `tutorials/`, `diff_diff/guides/` | Phase 2a | Low | | `HeterogeneousAdoptionDiD` time-varying dose on event study: Phase 2b REJECTS panels where `D_{g,t}` varies within a unit for `t >= F` (the aggregation uses `D_{g, F}` as the single regressor for all horizons, paper Appendix B.2 constant-dose convention). A follow-up PR could add a time-varying-dose estimator for these panels; current behavior is front-door rejection with a redirect to `ChaisemartinDHaultfoeuille`. | `diff_diff/had.py::_validate_had_panel_event_study` | Phase 2b | Low | diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index 7c04bf50..3dcb8d26 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -60,6 +60,16 @@ HeterogeneousAdoptionDiDEventStudyResults, HeterogeneousAdoptionDiDResults, ) +from diff_diff.had_pretests import ( + HADPretestReport, + QUGTestResults, + StuteTestResults, + YatchewTestResults, + did_had_pretest_workflow, + qug_test, + stute_test, + yatchew_hr_test, +) from diff_diff.estimators import ( DifferenceInDifferences, MultiPeriodDiD, @@ -442,6 +452,15 @@ "HeterogeneousAdoptionDiDResults", "HeterogeneousAdoptionDiDEventStudyResults", "HAD", + # HeterogeneousAdoptionDiD pre-test diagnostics (Phase 3) + "qug_test", + "stute_test", + "yatchew_hr_test", + "did_had_pretest_workflow", + "QUGTestResults", + "StuteTestResults", + "YatchewTestResults", + "HADPretestReport", # Datasets "load_card_krueger", "load_castle_doctrine", diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py new file mode 100644 index 00000000..fff55aea --- /dev/null +++ b/diff_diff/had_pretests.py @@ -0,0 +1,1415 @@ +"""Pre-test diagnostics for the HeterogeneousAdoptionDiD estimator. + +Paper Section 4 (de Chaisemartin, Ciccia, D'Haultfoeuille, Knau 2026, +arXiv:2405.04465v6) prescribes a four-step pre-testing workflow for TWFE +validity in HADs. Phase 3 ships steps 1 and 3 of that workflow (step 2 is +deferred): + +1. 
:func:`qug_test` - order-statistic ratio test of the support infimum + ``H_0: d_lower = 0`` (paper Theorem 4). Closed-form, tuning-free. +2. :func:`stute_test` - Cramer-von Mises cusum test of linearity of + ``E[ΔY | D_2]`` with Mammen (1993) wild bootstrap p-value (paper + Appendix D). +3. :func:`yatchew_hr_test` - heteroskedasticity-robust variance-ratio + linearity test (paper Theorem 7 / Equation 29). Feasible at + ``G >= 100k``. + +The composite :func:`did_had_pretest_workflow` runs the three implemented +tests in sequence on a two-period HAD panel and returns a +:class:`HADPretestReport` with a partial-workflow verdict. When all three +fail-to-reject, the verdict explicitly flags that **the paper's step 2 +pre-trends test (Assumption 7) is NOT run** — callers do not receive an +unconditional "TWFE safe" signal; the Assumption 7 check must be performed +separately (e.g., via an event-study / placebo analysis) until the Phase 3 +follow-up patch lands the joint Equation 18 cross-horizon Stute variant. + +See ``docs/methodology/REGISTRY.md`` and ``TODO.md`` for the deferred items. +""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import numpy as np +import pandas as pd +from scipy import stats + +from diff_diff.had import ( + _aggregate_first_difference, + _json_safe_scalar, + _validate_had_panel, +) +from diff_diff.utils import _generate_mammen_weights + +__all__ = [ + "QUGTestResults", + "StuteTestResults", + "YatchewTestResults", + "HADPretestReport", + "qug_test", + "stute_test", + "yatchew_hr_test", + "did_had_pretest_workflow", +] + + +_MIN_G_QUG = 2 +_MIN_G_STUTE = 10 +_MIN_G_YATCHEW = 3 +_MIN_N_BOOTSTRAP = 99 +_STUTE_LARGE_G_THRESHOLD = 100_000 + +# Scale-invariant tolerance for detecting a numerically exact linear OLS fit. 
+# The ratio SSR / TSS = sum(eps^2) / sum((dy - dybar)^2) equals 1 - R^2 +# and is BOTH TRANSLATION-INVARIANT (centering absorbs additive shifts) +# and SCALE-INVARIANT (the ratio is dimensionless under multiplicative +# rescaling of dy). Under exact Assumption 8, residuals are mathematically +# zero; in practice FP round-off leaves eps on the order of machine-epsilon +# (~1e-16). Squared that is ~1e-32. The threshold ~1e-24 leaves ~10^8 +# accumulated FP operations of margin so genuinely-noisy data is never +# mis-classified. +# +# IMPORTANT: the comparison is purely ``eps^2 <= tol * dy_centered^2`` with +# NO additive floor (e.g. ``max(dy_centered^2, 1.0)`` would break scale +# invariance - scaling dy by 1e-12 would make dy_centered^2 ~ 1e-24 but +# the floor would hold the threshold at 1.0, firing the shortcut on +# noisy data that should not trigger it). The ``dy_centered^2 == 0`` +# edge case (constant dy) is handled by a separate branch above the +# relative comparison, so the relative form is only applied when the +# denominator is genuinely positive. +_EXACT_LINEAR_RELATIVE_TOL = 1e-24 + + +# ============================================================================= +# Result dataclasses +# ============================================================================= + + +@dataclass +class QUGTestResults: + """Result of :func:`qug_test` (paper Theorem 4). + + The QUG test rejects ``H_0: d_lower = 0`` when the order-statistic + ratio ``T = D_{(1)} / (D_{(2)} - D_{(1)})`` exceeds ``1/alpha - 1``. + Under the null, the asymptotic limit law of ``T`` is the ratio of two + independent Exp(1) random variables, with CDF ``F(t) = t / (1 + t)``, + so ``p_value = 1 / (1 + T)``. + + Attributes + ---------- + t_stat : float + ``D_{(1)} / (D_{(2)} - D_{(1)})``. NaN when fewer than 2 non-zero + observations remain or when the two smallest doses tie. + p_value : float + ``1 / (1 + t_stat)`` under the null. NaN when ``t_stat`` is NaN. 
+ reject : bool + ``True`` iff ``t_stat > critical_value``. ``False`` on NaN statistic. + alpha : float + Significance level used. + critical_value : float + ``1 / alpha - 1``. Populated even when the statistic is NaN so + downstream readers can inspect the decision threshold. + n_obs : int + Number of observations after filtering to ``d > 0``. + n_excluded_zero : int + Number of zero-dose observations excluded from the sample. + d_order_1 : float + Smallest positive dose ``D_{(1)}``. NaN when ``n_obs < 2``. + d_order_2 : float + Second-smallest positive dose ``D_{(2)}``. NaN when ``n_obs < 2``. + """ + + t_stat: float + p_value: float + reject: bool + alpha: float + critical_value: float + n_obs: int + n_excluded_zero: int + d_order_1: float + d_order_2: float + + def __repr__(self) -> str: + return ( + f"QUGTestResults(t_stat={self.t_stat:.4f}, p_value={self.p_value:.4f}, " + f"reject={self.reject}, alpha={self.alpha}, n_obs={self.n_obs})" + ) + + def summary(self) -> str: + """Formatted summary table.""" + width = 64 + lines = [ + "=" * width, + "QUG null test (H_0: d_lower = 0)".center(width), + "=" * width, + f"{'Statistic T:':<30} {self.t_stat:>20.4f}", + f"{'p-value:':<30} {self.p_value:>20.4f}", + f"{'Critical value (1/alpha-1):':<30} {self.critical_value:>20.4f}", + f"{'Reject H_0:':<30} {str(self.reject):>20}", + f"{'alpha:':<30} {self.alpha:>20.4f}", + f"{'Observations:':<30} {self.n_obs:>20}", + f"{'Excluded (d == 0):':<30} {self.n_excluded_zero:>20}", + f"{'D_(1):':<30} {self.d_order_1:>20.4f}", + f"{'D_(2):':<30} {self.d_order_2:>20.4f}", + "=" * width, + ] + return "\n".join(lines) + + def print_summary(self) -> None: + """Print the summary to stdout.""" + print(self.summary()) + + def to_dict(self) -> Dict[str, Any]: + """Return results as a JSON-safe dict.""" + return { + "test": "qug", + "t_stat": _json_safe_scalar(self.t_stat), + "p_value": _json_safe_scalar(self.p_value), + "reject": bool(self.reject), + "alpha": float(self.alpha), + 
"critical_value": _json_safe_scalar(self.critical_value), + "n_obs": int(self.n_obs), + "n_excluded_zero": int(self.n_excluded_zero), + "d_order_1": _json_safe_scalar(self.d_order_1), + "d_order_2": _json_safe_scalar(self.d_order_2), + } + + def to_dataframe(self) -> pd.DataFrame: + """Return a one-row DataFrame of the result dict.""" + return pd.DataFrame([self.to_dict()]) + + +@dataclass +class StuteTestResults: + """Result of :func:`stute_test` (paper Appendix D). + + The Stute test rejects the null that ``E[ΔY | D_2]`` is linear in + ``D_2`` (paper Assumption 8) when the sorted-residual CvM statistic + ``S = (1/G^2) Σ (Σ_{h=1}^g eps_{(h)})^2`` exceeds the Mammen wild + bootstrap ``1 - alpha`` quantile. + + Attributes + ---------- + cvm_stat : float + CvM statistic. NaN when ``G < 10`` (below the threshold the + statistic is not well-calibrated). + p_value : float + Bootstrap p-value ``(1 + sum(S_b >= S)) / (B + 1)``. NaN when + the statistic is NaN. + reject : bool + ``True`` iff ``p_value <= alpha``. ``False`` on NaN. + alpha : float + Significance level used. + n_bootstrap : int + Number of Mammen wild bootstrap replications. + n_obs : int + Number of observations. + seed : int or None + Seed passed to ``np.random.default_rng``. ``None`` when unseeded. 
+ """ + + cvm_stat: float + p_value: float + reject: bool + alpha: float + n_bootstrap: int + n_obs: int + seed: Optional[int] + + def __repr__(self) -> str: + return ( + f"StuteTestResults(cvm_stat={self.cvm_stat:.4f}, " + f"p_value={self.p_value:.4f}, reject={self.reject}, " + f"alpha={self.alpha}, n_bootstrap={self.n_bootstrap}, " + f"n_obs={self.n_obs})" + ) + + def summary(self) -> str: + """Formatted summary table.""" + width = 64 + lines = [ + "=" * width, + "Stute CvM linearity test (H_0: linear E[dY|D])".center(width), + "=" * width, + f"{'CvM statistic:':<30} {self.cvm_stat:>20.4f}", + f"{'Bootstrap p-value:':<30} {self.p_value:>20.4f}", + f"{'Reject H_0:':<30} {str(self.reject):>20}", + f"{'alpha:':<30} {self.alpha:>20.4f}", + f"{'Bootstrap replications:':<30} {self.n_bootstrap:>20}", + f"{'Observations:':<30} {self.n_obs:>20}", + f"{'Seed:':<30} {str(self.seed):>20}", + "=" * width, + ] + return "\n".join(lines) + + def print_summary(self) -> None: + """Print the summary to stdout.""" + print(self.summary()) + + def to_dict(self) -> Dict[str, Any]: + """Return results as a JSON-safe dict.""" + return { + "test": "stute", + "cvm_stat": _json_safe_scalar(self.cvm_stat), + "p_value": _json_safe_scalar(self.p_value), + "reject": bool(self.reject), + "alpha": float(self.alpha), + "n_bootstrap": int(self.n_bootstrap), + "n_obs": int(self.n_obs), + "seed": None if self.seed is None else int(self.seed), + } + + def to_dataframe(self) -> pd.DataFrame: + """Return a one-row DataFrame of the result dict.""" + return pd.DataFrame([self.to_dict()]) + + +@dataclass +class YatchewTestResults: + """Result of :func:`yatchew_hr_test` (paper Theorem 7 / Equation 29). + + Heteroskedasticity-robust test of the same linearity null as + :func:`stute_test`, but using Yatchew's difference-based variance + estimator. 
The test statistic + ``T_hr = sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W`` + is asymptotically N(0, 1) under H_0; rejection uses the one-sided + standard-normal critical value. + + Attributes + ---------- + t_stat_hr : float + Test statistic ``T_hr`` from paper Equation 29. NaN when + ``G < 3``. + p_value : float + ``1 - Phi(T_hr)``. NaN when the statistic is NaN. + reject : bool + ``True`` iff ``T_hr >= critical_value``. ``False`` on NaN. + alpha : float + Significance level used. + critical_value : float + One-sided standard-normal critical value ``z_{1 - alpha}``. + sigma2_lin : float + Residual variance from OLS of ``dy`` on ``d``. + sigma2_diff : float + Yatchew differencing variance + ``(1 / (2G)) * sum((dy_{(g)} - dy_{(g-1)})^2)`` - divisor is ``2G`` + (paper-literal), NOT ``2(G-1)``. + sigma2_W : float + Heteroskedasticity-robust scale + ``sqrt((1 / (G-1)) * sum(eps_{(g)}^2 * eps_{(g-1)}^2))``. + n_obs : int + Number of observations. + """ + + t_stat_hr: float + p_value: float + reject: bool + alpha: float + critical_value: float + sigma2_lin: float + sigma2_diff: float + sigma2_W: float + n_obs: int + + def __repr__(self) -> str: + return ( + f"YatchewTestResults(t_stat_hr={self.t_stat_hr:.4f}, " + f"p_value={self.p_value:.4f}, reject={self.reject}, " + f"alpha={self.alpha}, n_obs={self.n_obs})" + ) + + def summary(self) -> str: + """Formatted summary table.""" + width = 64 + lines = [ + "=" * width, + "Yatchew-HR linearity test (H_0: linear E[dY|D])".center(width), + "=" * width, + f"{'T_hr statistic:':<30} {self.t_stat_hr:>20.4f}", + f"{'p-value:':<30} {self.p_value:>20.4f}", + f"{'Critical value (1-sided z):':<30} {self.critical_value:>20.4f}", + f"{'Reject H_0:':<30} {str(self.reject):>20}", + f"{'alpha:':<30} {self.alpha:>20.4f}", + f"{'sigma^2_lin (OLS):':<30} {self.sigma2_lin:>20.4f}", + f"{'sigma^2_diff (Yatchew):':<30} {self.sigma2_diff:>20.4f}", + f"{'sigma^2_W (HR scale):':<30} {self.sigma2_W:>20.4f}", + f"{'Observations:':<30} 
{self.n_obs:>20}", + "=" * width, + ] + return "\n".join(lines) + + def print_summary(self) -> None: + """Print the summary to stdout.""" + print(self.summary()) + + def to_dict(self) -> Dict[str, Any]: + """Return results as a JSON-safe dict.""" + return { + "test": "yatchew_hr", + "t_stat_hr": _json_safe_scalar(self.t_stat_hr), + "p_value": _json_safe_scalar(self.p_value), + "reject": bool(self.reject), + "alpha": float(self.alpha), + "critical_value": _json_safe_scalar(self.critical_value), + "sigma2_lin": _json_safe_scalar(self.sigma2_lin), + "sigma2_diff": _json_safe_scalar(self.sigma2_diff), + "sigma2_W": _json_safe_scalar(self.sigma2_W), + "n_obs": int(self.n_obs), + } + + def to_dataframe(self) -> pd.DataFrame: + """Return a one-row DataFrame of the result dict.""" + return pd.DataFrame([self.to_dict()]) + + +@dataclass +class HADPretestReport: + """Composite output of :func:`did_had_pretest_workflow`. + + Bundles the three individual tests with an overall verdict string. + + .. important:: + This report reflects a **partial** workflow: Phase 3 ships paper + Sections 4.2-4.3 steps 1 (QUG) and 3 (linearity via Stute + + Yatchew-HR), but **NOT** step 2 (Assumption 7 pre-trends test via + Equation 18). Even when ``all_pass`` is ``True``, the paper's + four-step certification for TWFE validity is incomplete — pre- + trends testing is a separate diagnostic that must be run via the + user's own event-study / placebo analysis until the Phase 3 + follow-up patch lands the joint Equation 18 Stute test. + + Attributes + ---------- + qug : QUGTestResults + stute : StuteTestResults + yatchew : YatchewTestResults + all_pass : bool + ``True`` iff (a) QUG is conclusive (step 1), (b) at least ONE of + Stute / Yatchew is conclusive (step 3 - paper's "Stute or + Yatchew" wording), AND (c) no conclusive test rejects. 
This + gating follows the paper's four-step workflow exactly: step 3 + accepts either linearity test, so a conclusive Stute is + sufficient even when Yatchew returns NaN (e.g. tied-dose + panels). Even when ``all_pass`` is ``True``, the report is a + PARTIAL indicator: it does not certify Assumption 7 (pre-trends), + which is not tested by Phase 3. + verdict : str + Human-readable classification. Paper rule: TWFE is admissible + only if NONE of the implemented tests rejects. A conclusive + rejection must therefore never be hidden by a purely-inconclusive + verdict just because another step happens to be NaN. + + Priority: + + 1. If any CONCLUSIVE test rejected, that is the primary verdict: + a bundled string naming each failed assumption, + ``"support infimum rejected - continuous_at_zero design + invalid (QUG)"`` and/or ``"linearity rejected - heterogeneity + bias ({Stute[,Yatchew]})"``. If another step is unresolved + (QUG NaN, or BOTH linearity tests NaN), an + ``"; additional steps unresolved: ..."`` suffix is APPENDED + rather than replacing the rejection. + 2. If no conclusive rejection but a required step is unresolved, + the verdict is ``"inconclusive - QUG NaN"`` when step 1 is + the only unresolved piece, ``"inconclusive - both Stute and + Yatchew linearity tests NaN"`` when step 3 lacks any + conclusive linearity test, or ``"inconclusive - QUG NaN; + both Stute and Yatchew linearity tests NaN"`` when both are + unresolved. + 3. Otherwise (all required steps conclusive and none reject), + the partial-workflow fail-to-reject verdict: + ``"QUG and linearity diagnostics fail-to-reject[ (Yatchew + NaN - skipped)]; Assumption 7 pre-trends test NOT run (paper + step 2 deferred to Phase 3 follow-up)"``. The + ``" (... - skipped)"`` suffix appears when Stute OR Yatchew + was NaN but the other was conclusive (step 3 resolved via + the paper's "Stute OR Yatchew" wording). + alpha : float + Significance level shared across tests. 
+ n_obs : int + Unit count after aggregation to the two-period first-difference. + """ + + qug: QUGTestResults + stute: StuteTestResults + yatchew: YatchewTestResults + all_pass: bool + verdict: str + alpha: float + n_obs: int + + def __repr__(self) -> str: + return ( + f"HADPretestReport(all_pass={self.all_pass}, " + f"verdict={self.verdict!r}, n_obs={self.n_obs})" + ) + + def summary(self) -> str: + """Formatted summary of all three tests and the verdict.""" + width = 72 + parts = [ + "=" * width, + "HAD pre-test workflow".center(width), + "=" * width, + self.qug.summary(), + "", + self.stute.summary(), + "", + self.yatchew.summary(), + "", + "=" * width, + f"{'All pass:':<30} {str(self.all_pass):>40}", + f"Verdict: {self.verdict}", + "=" * width, + ] + return "\n".join(parts) + + def print_summary(self) -> None: + """Print the summary to stdout.""" + print(self.summary()) + + def to_dict(self) -> Dict[str, Any]: + """Return a JSON-safe nested dict of the full report.""" + return { + "qug": self.qug.to_dict(), + "stute": self.stute.to_dict(), + "yatchew": self.yatchew.to_dict(), + "all_pass": bool(self.all_pass), + "verdict": str(self.verdict), + "alpha": float(self.alpha), + "n_obs": int(self.n_obs), + } + + def to_dataframe(self) -> pd.DataFrame: + """Return a tidy 3-row DataFrame (one row per test). + + Columns (in order): ``[test, statistic_name, statistic_value, + p_value, reject, alpha, n_obs]``. 
+ """ + rows = [ + { + "test": "qug", + "statistic_name": "t_stat", + "statistic_value": _json_safe_scalar(self.qug.t_stat), + "p_value": _json_safe_scalar(self.qug.p_value), + "reject": bool(self.qug.reject), + "alpha": float(self.qug.alpha), + "n_obs": int(self.qug.n_obs), + }, + { + "test": "stute", + "statistic_name": "cvm_stat", + "statistic_value": _json_safe_scalar(self.stute.cvm_stat), + "p_value": _json_safe_scalar(self.stute.p_value), + "reject": bool(self.stute.reject), + "alpha": float(self.stute.alpha), + "n_obs": int(self.stute.n_obs), + }, + { + "test": "yatchew_hr", + "statistic_name": "t_stat_hr", + "statistic_value": _json_safe_scalar(self.yatchew.t_stat_hr), + "p_value": _json_safe_scalar(self.yatchew.p_value), + "reject": bool(self.yatchew.reject), + "alpha": float(self.yatchew.alpha), + "n_obs": int(self.yatchew.n_obs), + }, + ] + cols = [ + "test", + "statistic_name", + "statistic_value", + "p_value", + "reject", + "alpha", + "n_obs", + ] + return pd.DataFrame(rows).reindex(columns=cols) + + +# ============================================================================= +# Private helpers +# ============================================================================= + + +def _validate_1d_numeric(arr: np.ndarray, name: str) -> np.ndarray: + """Return ``arr`` as a 1D float ndarray or raise ``ValueError``.""" + a = np.asarray(arr) + if a.ndim != 1: + raise ValueError(f"{name} must be 1-dimensional, got shape {a.shape}.") + a = a.astype(np.float64, copy=False) + if np.isnan(a).any(): + raise ValueError(f"{name} contains NaN values.") + if not np.isfinite(a).all(): + raise ValueError(f"{name} contains non-finite values (inf).") + return a + + +def _fit_ols_intercept_slope(d: np.ndarray, dy: np.ndarray) -> "tuple[float, float, np.ndarray]": + """Fit ``dy = a + b*d + eps`` via closed-form OLS. + + Returns ``(a_hat, b_hat, residuals)`` where ``residuals`` has the + same length as ``d`` in the ORIGINAL input order (not sorted). 
+ """ + d_mean = d.mean() + dy_mean = dy.mean() + d_dev = d - d_mean + var_d = np.dot(d_dev, d_dev) + if var_d <= 0.0: + # Degenerate case: all dose values equal. Slope undefined. + # Caller is responsible for gating before we reach here; if we + # do reach here, return (mean(dy), 0, dy - mean(dy)). + return float(dy_mean), 0.0, dy - dy_mean + b_hat = float(np.dot(d_dev, dy - dy_mean) / var_d) + a_hat = float(dy_mean - b_hat * d_mean) + residuals = dy - a_hat - b_hat * d + return a_hat, b_hat, residuals + + +def _cvm_statistic(eps_sorted: np.ndarray, d_sorted: np.ndarray) -> float: + """Compute the tie-safe Cramer-von Mises cusum statistic. + + Paper definition (Appendix D): + + c_G(d) := G^{-1/2} * sum_g 1{D_g <= d} * eps_g + S := (1/G) * sum_g c_G^2(D_g) = (1/G^2) * sum_g (C_g)^2 + + where ``C_g = sum_{h : D_h <= D_g} eps_h`` is the cumulative residual + sum up to and including ALL observations with dose <= D_g. This + definition is tie-safe: at a tied dose value ``D_g == D_{g+1}``, both + c_G(D_g) and c_G(D_{g+1}) include all tie-block members, so the + cumulative sum used at each tied observation is the cumulative sum + through the END of the tie block. + + A naive per-observation ``cumsum`` on sorted residuals violates this + at tie blocks (each tied observation sees a partial within-block + cumulative sum). This implementation collapses each tie block to the + post-tie cumulative sum before squaring, matching the paper definition. + + Parameters + ---------- + eps_sorted : np.ndarray, shape (G,) + Residuals sorted by ``d_sorted``. + d_sorted : np.ndarray, shape (G,) + Regressor values sorted ascending. Must be sorted consistently + with ``eps_sorted``. + + Returns + ------- + float + ``S = (1 / G^2) * sum_g C_g^2``. + """ + G = eps_sorted.shape[0] + cumsum = np.cumsum(eps_sorted) + # Tie-safe correction: replace within-tie-block values with the + # cumulative sum at the END of each tie block. 
np.unique on the + # already-sorted regressor gives per-unique-value counts; the last + # index of each tie block is `cumsum(counts) - 1`, and np.repeat + # expands that back to per-observation. + _, counts = np.unique(d_sorted, return_counts=True) + tie_end_idx = np.cumsum(counts) - 1 + cumsum_tie_safe = np.repeat(cumsum[tie_end_idx], counts) + return float(np.sum(cumsum_tie_safe * cumsum_tie_safe) / (G * G)) + + +def _compose_verdict( + qug: QUGTestResults, stute: StuteTestResults, yatchew: YatchewTestResults +) -> str: + """Build the :class:`HADPretestReport` verdict string. + + Paper Section 4.2-4.3 specifies a four-step workflow; Phase 3 ships + step 1 (QUG) and step 3 (linearity, via ``stute_test`` OR + ``yatchew_hr_test``). The linearity step accepts either test, so a + conclusive Stute result alone suffices even when Yatchew is NaN + (e.g. tied doses, which Yatchew rejects by contract). + + Paper logic: TWFE is admissible only if NONE of the implemented + tests rejects. A conclusive rejection must therefore never be hidden + by a purely-inconclusive verdict just because another step is NaN - + it is reported as the primary outcome and any unresolved steps are + appended as a suffix. + + Priority: + + 1. Collect all rejection reasons from CONCLUSIVE tests. If any + conclusive test rejected, that is the primary verdict. Unresolved + steps (QUG NaN, or BOTH linearity tests NaN) are appended as + ``"; additional steps unresolved: ..."`` rather than replacing + the rejection. + 2. If no conclusive test rejected but a required step is unresolved, + return a pure ``"inconclusive - ..."`` verdict naming the + unresolved step(s). + 3. Otherwise (all required steps conclusive and none reject), + return the partial-workflow fail-to-reject verdict flagging the + Assumption 7 gap, with a ``" (Yatchew NaN - skipped)"`` suffix + when ONE linearity test was NaN and the other was conclusive. 
+ """ + qug_ok = bool(np.isfinite(qug.p_value)) + stute_ok = bool(np.isfinite(stute.p_value)) + yatchew_ok = bool(np.isfinite(yatchew.p_value)) + + # Rejections from conclusive tests only. NaN-p tests have reject=False + # by convention, so the ``ok and reject`` guard is defensive. + qug_rej = qug_ok and qug.reject + stute_rej = stute_ok and stute.reject + yatchew_rej = yatchew_ok and yatchew.reject + + reasons = [] + if qug_rej: + reasons.append("support infimum rejected - continuous_at_zero design invalid (QUG)") + if stute_rej or yatchew_rej: + which = ",".join( + name for name, rejected in (("Stute", stute_rej), ("Yatchew", yatchew_rej)) if rejected + ) + reasons.append(f"linearity rejected - heterogeneity bias ({which})") + + # Unresolved steps: QUG is required; step 3 requires at least one + # conclusive linearity test. + unresolved = [] + if not qug_ok: + unresolved.append("QUG NaN") + if not stute_ok and not yatchew_ok: + unresolved.append("both Stute and Yatchew linearity tests NaN") + + if reasons: + # A conclusive rejection is the primary outcome. Append any + # unresolved-step note rather than replacing the rejection. + verdict = "; ".join(reasons) + if unresolved: + verdict += "; additional steps unresolved: " + "; ".join(unresolved) + return verdict + + if unresolved: + return "inconclusive - " + "; ".join(unresolved) + + # All required steps conclusive, none reject. Note any single skipped + # linearity test (the OTHER linearity test was conclusive and + # fail-to-reject, so step 3 IS resolved). 
+ skipped = [] + if not stute_ok: + skipped.append("Stute NaN") + if not yatchew_ok: + skipped.append("Yatchew NaN") + skip_note = f" ({'; '.join(skipped)} - skipped)" if skipped else "" + return ( + "QUG and linearity diagnostics fail-to-reject" + f"{skip_note}; Assumption 7 pre-trends test NOT run " + "(paper step 2 deferred to Phase 3 follow-up)" + ) + + +# ============================================================================= +# Public test functions +# ============================================================================= + + +def qug_test(d: np.ndarray, alpha: float = 0.05) -> QUGTestResults: + """Run the QUG null test for the support infimum (paper Theorem 4). + + Tests ``H_0: d_lower = 0`` using the order-statistic ratio + ``T = D_{(1)} / (D_{(2)} - D_{(1)})``, rejecting when ``T > 1/alpha - 1``. + Under the null, the asymptotic limit law of ``T`` is the ratio of two + independent Exp(1) variables with CDF ``F(t) = t / (1 + t)``, so the + one-sided p-value is ``1 / (1 + T)``. + + Zero-dose observations are filtered out (the test targets the infimum + of the treated support). A ``UserWarning`` is emitted naming the + exclusion count. When fewer than two positive doses remain, the test + returns all-NaN inference with ``reject=False``. + + Parameters + ---------- + d : np.ndarray, shape (G,) + Post-period dose vector. Must be 1D numeric and contain no NaN. + alpha : float, default 0.05 + One-sided significance level. Must satisfy ``0 < alpha < 1``. + + Returns + ------- + QUGTestResults + Result dataclass with ``t_stat``, ``p_value``, ``reject``, and + sample metadata. + + Raises + ------ + ValueError + If ``d`` is not 1D numeric or contains NaN, or if ``alpha`` is + not in ``(0, 1)``. + + Notes + ----- + Tie-break: when ``D_{(1)} == D_{(2)}`` the statistic is undefined. + The test returns ``t_stat=NaN, p_value=NaN, reject=False`` with a + ``UserWarning`` rather than raising. 
+ + References + ---------- + de Chaisemartin, Ciccia, D'Haultfoeuille, Knau (2026, arXiv:2405.04465v6), + Theorem 4 and Section 4.2. + """ + if not (0.0 < alpha < 1.0): + raise ValueError(f"alpha must satisfy 0 < alpha < 1, got {alpha}.") + + d_arr = _validate_1d_numeric(d, "d") + critical_value = 1.0 / alpha - 1.0 + + # HAD support restriction: doses must be non-negative (paper Section 2). + # Reject negative doses at the front door rather than silently filtering + # them into the zero-exclusion counter. + if (d_arr < 0).any(): + n_neg = int((d_arr < 0).sum()) + raise ValueError( + f"qug_test: d contains {n_neg} negative value(s); HAD doses " + f"must be non-negative (paper Section 2). Check your dose " + f"column or pre-process before calling qug_test." + ) + + mask = d_arr > 0 + d_nz = d_arr[mask] + n_excluded = int(d_arr.shape[0] - d_nz.shape[0]) + if n_excluded > 0: + warnings.warn( + f"qug_test: excluded {n_excluded} observation(s) with d == 0 " + f"(the QUG null test targets the infimum of the treated-dose " + f"support; zero-dose observations are not in scope).", + UserWarning, + stacklevel=2, + ) + + n_obs = int(d_nz.shape[0]) + if n_obs < _MIN_G_QUG: + warnings.warn( + f"qug_test: only {n_obs} positive-dose observation(s); need " + f"at least {_MIN_G_QUG}. Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return QUGTestResults( + t_stat=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + critical_value=critical_value, + n_obs=n_obs, + n_excluded_zero=n_excluded, + d_order_1=float("nan"), + d_order_2=float("nan"), + ) + + # Use np.partition for O(G) extraction of the two smallest positive + # doses (faster than full O(G log G) sort). For k=1, np.partition + # guarantees partitioned[0] <= partitioned[1] = D_{(2)} (the 2nd-smallest), + # which implies partitioned[0] = D_{(1)} (the minimum). 
+ partitioned = np.partition(d_nz, 1) + D1 = float(partitioned[0]) + D2 = float(partitioned[1]) + + if D2 == D1: + warnings.warn( + "qug_test: D_(1) == D_(2); the test statistic is undefined " + "(ties at the minimum positive dose). Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return QUGTestResults( + t_stat=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + critical_value=critical_value, + n_obs=n_obs, + n_excluded_zero=n_excluded, + d_order_1=D1, + d_order_2=D2, + ) + + t_stat = D1 / (D2 - D1) + p_value = 1.0 / (1.0 + t_stat) + reject = t_stat > critical_value + + return QUGTestResults( + t_stat=float(t_stat), + p_value=float(p_value), + reject=bool(reject), + alpha=alpha, + critical_value=critical_value, + n_obs=n_obs, + n_excluded_zero=n_excluded, + d_order_1=D1, + d_order_2=D2, + ) + + +def stute_test( + d: np.ndarray, + dy: np.ndarray, + alpha: float = 0.05, + n_bootstrap: int = 999, + seed: Optional[int] = None, +) -> StuteTestResults: + """Run the Stute Cramer-von Mises linearity test (paper Appendix D). + + Tests ``H_0: E[ΔY | D_2]`` is linear in ``D_2`` (paper Assumption 8). + The test statistic is the sorted-residual cusum CvM + + S = (1 / G^2) * sum_{g=1}^G (sum_{h=1}^g eps_(h))^2 + + where ``eps_(h)`` is the ``h``-th OLS residual after sorting by ``d``. + The p-value is the bootstrap tail probability + ``(1 + sum(S_b >= S)) / (B + 1)`` under the Mammen (1993) two-point + wild bootstrap; each bootstrap iteration refits OLS on + ``dy_b = a_hat + b_hat * d + eps * eta`` with multiplier weights ``eta``. + + Parameters + ---------- + d, dy : np.ndarray, shape (G,) + Dose and first-difference outcome vectors. + alpha : float, default 0.05 + Significance level. Must satisfy ``0 < alpha < 1``. + n_bootstrap : int, default 999 + Number of Mammen wild bootstrap replications. Must be ``>= 99`` + (below which the discretised p-value grid is too coarse). + seed : int or None, default None + Seed for ``np.random.default_rng``. 
Pass an integer for + reproducible results. + + Returns + ------- + StuteTestResults + + Raises + ------ + ValueError + If ``d`` / ``dy`` are not 1D numeric, contain NaN, have unequal + lengths, if any ``d`` value is negative (paper Section 2 HAD + support restriction), if ``alpha`` is outside ``(0, 1)``, or if + ``n_bootstrap < 99``. + + Notes + ----- + Sample-size gate: below ``G = 10`` the CvM statistic is not + well-calibrated. In that case the function emits ``UserWarning`` and + returns all-NaN inference rather than raising. + + Large-G warning: at ``G > 100_000`` the per-iteration refit dominates + runtime; the function emits a ``UserWarning`` pointing users to + :func:`yatchew_hr_test`. Memory usage remains ``O(G)`` regardless + (no G x G matrix). + + References + ---------- + Stute, W. (1997). Nonparametric model checks for regression. Annals + of Statistics 25, 613-641. + Mammen, E. (1993). Bootstrap and wild bootstrap for high-dimensional + linear models. Annals of Statistics 21, 255-285. + de Chaisemartin et al. (2026), Appendix D. + """ + if not (0.0 < alpha < 1.0): + raise ValueError(f"alpha must satisfy 0 < alpha < 1, got {alpha}.") + if n_bootstrap < _MIN_N_BOOTSTRAP: + raise ValueError( + f"n_bootstrap must be >= {_MIN_N_BOOTSTRAP} (below this the " + f"discretised p-value grid is too coarse to be meaningful). " + f"Got n_bootstrap={n_bootstrap}." + ) + + d_arr = _validate_1d_numeric(d, "d") + dy_arr = _validate_1d_numeric(dy, "dy") + if d_arr.shape[0] != dy_arr.shape[0]: + raise ValueError( + f"d and dy must have the same length; got d.shape={d_arr.shape}, " + f"dy.shape={dy_arr.shape}." + ) + # HAD support restriction (paper Section 2): doses must be non-negative. + # Mirror the front-door guard from qug_test / _validate_had_panel. + if (d_arr < 0).any(): + n_neg = int((d_arr < 0).sum()) + raise ValueError( + f"stute_test: d contains {n_neg} negative value(s); HAD doses " + f"must be non-negative (paper Section 2). 
Check your dose " + f"column or pre-process before calling stute_test." + ) + + G = int(d_arr.shape[0]) + if G < _MIN_G_STUTE: + warnings.warn( + f"stute_test: G = {G} is below the minimum {_MIN_G_STUTE} for " + f"the CvM statistic to be well-calibrated. Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return StuteTestResults( + cvm_stat=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + n_bootstrap=int(n_bootstrap), + n_obs=G, + seed=seed, + ) + if G > _STUTE_LARGE_G_THRESHOLD: + warnings.warn( + f"stute_test: G = {G} exceeds {_STUTE_LARGE_G_THRESHOLD}; each " + f"bootstrap replication refits OLS at O(G) cost, so the " + f"{n_bootstrap}-replication loop may take tens of seconds or " + f"more. Consider yatchew_hr_test() instead (paper Theorem 7 " + f"recommends Yatchew-HR at large G).", + UserWarning, + stacklevel=2, + ) + + a_hat, b_hat, eps = _fit_ols_intercept_slope(d_arr, dy_arr) + # Genuine degeneracy: zero dose variation. The CvM cusum is defined + # against the regressor, and constant d carries no signal to test + # linearity against - emit NaN. + if np.var(d_arr) <= 0.0: + warnings.warn( + "stute_test: constant d (zero dose variation); the Stute " + "linearity test requires regressor variation. Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return StuteTestResults( + cvm_stat=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + n_bootstrap=int(n_bootstrap), + n_obs=G, + seed=seed, + ) + + # Numerically exact linear fit: Assumption 8 holds to IEEE precision, + # so the Stute CvM statistic is formally 0 and every bootstrap draw is + # also 0. Short-circuit to p=1 to avoid FP-noise-driven bootstrap + # comparisons where cvm_stat and S_b are both at machine-epsilon scale. + # Comparison is purely relative against CENTERED TSS: both translation- + # invariant (centering absorbs additive shifts) and scale-invariant + # (ratio is dimensionless under multiplicative dy rescaling).
+ eps_norm_sq = float(np.sum(eps * eps)) + dy_centered_sq = float(np.sum((dy_arr - dy_arr.mean()) ** 2)) + if dy_centered_sq <= 0.0: + # Constant dy (zero centered TSS): trivially linear in d. + # Return p = 1 without running the bootstrap. + return StuteTestResults( + cvm_stat=0.0, + p_value=1.0, + reject=False, + alpha=alpha, + n_bootstrap=int(n_bootstrap), + n_obs=G, + seed=seed, + ) + if eps_norm_sq <= _EXACT_LINEAR_RELATIVE_TOL * dy_centered_sq: + return StuteTestResults( + cvm_stat=0.0, + p_value=1.0, + reject=False, + alpha=alpha, + n_bootstrap=int(n_bootstrap), + n_obs=G, + seed=seed, + ) + + idx = np.argsort(d_arr, kind="stable") + d_sorted = d_arr[idx] + S = _cvm_statistic(eps[idx], d_sorted) + + rng = np.random.default_rng(seed) + bootstrap_S = np.empty(n_bootstrap, dtype=np.float64) + fitted = a_hat + b_hat * d_arr # baseline fitted values under H_0 + for b in range(n_bootstrap): + eta = _generate_mammen_weights(G, rng) + dy_b = fitted + eps * eta + _, _, eps_b = _fit_ols_intercept_slope(d_arr, dy_b) + bootstrap_S[b] = _cvm_statistic(eps_b[idx], d_sorted) + + p_value = float((1.0 + float(np.sum(bootstrap_S >= S))) / (n_bootstrap + 1.0)) + reject = p_value <= alpha + + return StuteTestResults( + cvm_stat=float(S), + p_value=p_value, + reject=bool(reject), + alpha=alpha, + n_bootstrap=int(n_bootstrap), + n_obs=G, + seed=seed, + ) + + +def yatchew_hr_test(d: np.ndarray, dy: np.ndarray, alpha: float = 0.05) -> YatchewTestResults: + """Run the Yatchew heteroskedasticity-robust linearity test. + + Tests ``H_0: E[ΔY | D_2]`` is linear in ``D_2`` (paper Assumption 8, + Theorem 7) via the variance-ratio statistic + + T_hr = sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W + + where + + sigma2_lin = (1/G) * sum(eps^2) # OLS residuals + sigma2_diff = (1/(2G)) * sum((dy_{(g)} - dy_{(g-1)})^2) # Yatchew differencing + sigma2_W = sqrt((1/(G-1)) * sum(eps_{(g)}^2 * eps_{(g-1)}^2)) + + and ``_{(g)}`` denotes sort by ``d``. 
Rejection uses the one-sided + standard-normal critical value ``z_{1-alpha}``. + + Parameters + ---------- + d, dy : np.ndarray, shape (G,) + Dose and first-difference outcome vectors. + alpha : float, default 0.05 + One-sided significance level. + + Returns + ------- + YatchewTestResults + + Raises + ------ + ValueError + If ``d`` / ``dy`` are not 1D numeric, contain NaN, have unequal + lengths, if any ``d`` value is negative (paper Section 2 HAD + support restriction), or if ``alpha`` is outside ``(0, 1)``. + + Notes + ----- + Sample-size gate: below ``G = 3`` the difference-variance estimator + is undefined; the function emits ``UserWarning`` and returns NaN + rather than raising. + + Dose ties: REJECTED with ``UserWarning`` + all-NaN result. The + difference-based variance estimator ``sigma2_diff`` and the + heteroskedasticity-robust scale ``sigma4_W`` both use adjacent + differences of quantities sorted by ``d``; under tied doses the + within-tie row ordering is arbitrary (stable sort falls back to input + order) so the statistic becomes order-dependent rather than + data-dependent. Callers with tied doses (mass-point designs, + discretised dose registers) should use :func:`stute_test` instead - + its tie-safe Cramer-von Mises statistic collapses tie blocks to the + post-tie cumulative sum and is provably order-invariant under + within-tie permutations. + + Exact-linear short-circuit: when the OLS residual sum-of-squares is + below IEEE precision relative to the centered total sum of squares + (``sum(eps^2) <= 1e-24 * sum((dy - dybar)^2)``, i.e. essentially + ``1 - R^2 == 0``), the test short-circuits to ``t_stat_hr=-inf, + p_value=1.0, reject=False`` - Assumption 8 holds exactly, the formal + statistic is ``-inf`` under the one-sided critical value, and the + correct decision is fail-to-reject. This shortcut is translation- + invariant because the comparison is against centered TSS (not raw + ``sum(dy^2)``). 
+ + Degenerate ``sigma4_W = 0`` with non-zero residuals: when the + adjacent-residual-product sum vanishes AFTER the exact-linear + shortcut is bypassed (e.g. residuals alternate zero/non-zero after + sorting), the formal statistic is ``+inf`` or ``-inf`` depending on + the sign of the numerator ``sigma2_lin - sigma2_diff``. The function + returns the sign-aware limit (``p=0, reject=True`` for positive + numerator; ``p=1, reject=False`` for negative; ``NaN`` for zero) + with a ``UserWarning``, rather than unconditionally mapping this to + ``p=1`` (which would flip a legitimate rejection). + + References + ---------- + Yatchew, A. (1997). An elementary estimator of the partial linear + model. Economics Letters 57, 135-143. + de Chaisemartin et al. (2026), Theorem 7 / Equation 29. + """ + if not (0.0 < alpha < 1.0): + raise ValueError(f"alpha must satisfy 0 < alpha < 1, got {alpha}.") + + d_arr = _validate_1d_numeric(d, "d") + dy_arr = _validate_1d_numeric(dy, "dy") + if d_arr.shape[0] != dy_arr.shape[0]: + raise ValueError( + f"d and dy must have the same length; got d.shape={d_arr.shape}, " + f"dy.shape={dy_arr.shape}." + ) + # HAD support restriction (paper Section 2): doses must be non-negative. + # Mirror the front-door guard from qug_test / _validate_had_panel. + if (d_arr < 0).any(): + n_neg = int((d_arr < 0).sum()) + raise ValueError( + f"yatchew_hr_test: d contains {n_neg} negative value(s); HAD " + f"doses must be non-negative (paper Section 2). Check your " + f"dose column or pre-process before calling yatchew_hr_test." + ) + + G = int(d_arr.shape[0]) + critical_value = float(stats.norm.ppf(1.0 - alpha)) + + if G < _MIN_G_YATCHEW: + warnings.warn( + f"yatchew_hr_test: G = {G} is below the minimum {_MIN_G_YATCHEW} " + f"(need at least 2 sorted differences). 
Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return YatchewTestResults( + t_stat_hr=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + critical_value=critical_value, + sigma2_lin=float("nan"), + sigma2_diff=float("nan"), + sigma2_W=float("nan"), + n_obs=G, + ) + + # Tie / constant-dose front-door guard. Yatchew's difference-based + # variance estimator uses adjacent differences of dy sorted by d; + # under tied doses the within-tie ordering is arbitrary (stable sort + # falls back to input row order), so the statistic becomes + # non-methodological and order-dependent. Reject at the front door + # with a UserWarning + NaN result rather than silently permuting. + # Mass-point designs and other tied-dose panels should use + # `stute_test` instead (its tie-safe CvM handles ties correctly). + n_unique_d = int(np.unique(d_arr).shape[0]) + if n_unique_d < G: + n_dups = G - n_unique_d + warnings.warn( + f"yatchew_hr_test: d contains {n_dups} duplicate value(s) " + f"(only {n_unique_d} distinct dose values out of G={G}); " + f"the difference-based variance estimator is not well-defined " + f"under ties because adjacent-difference statistics depend on " + f"arbitrary within-tie row ordering. Use stute_test() instead " + f"(its tie-safe CvM handles ties correctly). Returning NaN result.", + UserWarning, + stacklevel=2, + ) + return YatchewTestResults( + t_stat_hr=float("nan"), + p_value=float("nan"), + reject=False, + alpha=alpha, + critical_value=critical_value, + sigma2_lin=float("nan"), + sigma2_diff=float("nan"), + sigma2_W=float("nan"), + n_obs=G, + ) + + _, _, eps = _fit_ols_intercept_slope(d_arr, dy_arr) + sigma2_lin = float(np.mean(eps * eps)) + + # Numerically exact linear fit: same short-circuit as `stute_test`. + # Assumption 8 holds to IEEE precision; the Yatchew statistic is + # formally -inf (finite-negative numerator over zero denominator), + # which maps to p = 1 under the one-sided standard-normal critical + # value. 
Short-circuit so FP noise in ``sigma4_W`` cannot produce a + # spuriously large finite ``T_hr``. Comparison is purely relative + # against CENTERED TSS - translation- AND scale-invariant. + eps_norm_sq = float(np.sum(eps * eps)) + dy_centered_sq = float(np.sum((dy_arr - dy_arr.mean()) ** 2)) + if dy_centered_sq <= 0.0 or eps_norm_sq <= _EXACT_LINEAR_RELATIVE_TOL * dy_centered_sq: + # Exact-linear branch. Covers two cases: + # - dy_centered_sq == 0: dy is constant (trivially linear). + # - relative SSR below IEEE precision: near-exact OLS fit. + # For reporting, compute sigma2_diff on the sorted dy (finite, + # well-defined even in the exact-linear case). + idx_early = np.argsort(d_arr, kind="stable") + sigma2_diff_exact = float(np.sum(np.diff(dy_arr[idx_early]) ** 2) / (2.0 * G)) + return YatchewTestResults( + t_stat_hr=float("-inf"), + p_value=1.0, + reject=False, + alpha=alpha, + critical_value=critical_value, + sigma2_lin=sigma2_lin, + sigma2_diff=sigma2_diff_exact, + sigma2_W=0.0, + n_obs=G, + ) + + idx = np.argsort(d_arr, kind="stable") + dy_s = dy_arr[idx] + eps_s = eps[idx] + + diff_dy = np.diff(dy_s) # length G - 1 + # Paper-literal divisor: 2G (NOT 2(G-1)). This matches paper review + # line 168: sigma2_diff := (1/(2G)) * sum((dy_{(g)} - dy_{(g-1)})^2). + sigma2_diff = float(np.sum(diff_dy * diff_dy) / (2.0 * G)) + + # sigma4_W = (1/(G-1)) * sum(eps_(g)^2 * eps_(g-1)^2) using np.mean + # which divides by the length of the input (G-1 here). Matches paper + # review line 171. + sigma4_W = float(np.mean(eps_s[1:] ** 2 * eps_s[:-1] ** 2)) + if sigma4_W <= 0.0: + # sigma4_W = 0 AFTER the exact-linear short-circuit means OLS + # residuals are NOT zero (the shortcut already caught that case) + # but every adjacent pair of sorted squared residuals contains a + # zero (e.g. residuals alternate zero / nonzero after sort). 
+ # The formal test statistic is ±inf depending on the sign of the + # numerator ``sigma2_lin - sigma2_diff``; mapping every such case + # to p=1 (as an earlier revision did) can flip a legitimate + # rejection into a fail-to-reject. + warnings.warn( + f"yatchew_hr_test: sigma4_W = 0 with non-zero residuals " + f"(sigma2_lin = {sigma2_lin:.6g}, sigma2_diff = {sigma2_diff:.6g}); " + f"the formal test statistic is infinite. Returning the " + f"sign-aware limit decision.", + UserWarning, + stacklevel=2, + ) + numerator = sigma2_lin - sigma2_diff + if numerator > 0.0: + # T_hr -> +inf: reject (far into right tail). + t_stat_hr_val = float("inf") + p_value_val = 0.0 + reject_val = True + elif numerator < 0.0: + # T_hr -> -inf: fail-to-reject. + t_stat_hr_val = float("-inf") + p_value_val = 1.0 + reject_val = False + else: + # 0/0: genuinely indeterminate. + t_stat_hr_val = float("nan") + p_value_val = float("nan") + reject_val = False + return YatchewTestResults( + t_stat_hr=t_stat_hr_val, + p_value=p_value_val, + reject=reject_val, + alpha=alpha, + critical_value=critical_value, + sigma2_lin=sigma2_lin, + sigma2_diff=sigma2_diff, + sigma2_W=0.0, + n_obs=G, + ) + sigma2_W = float(np.sqrt(sigma4_W)) + + t_stat_hr = float(np.sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W) + p_value = float(1.0 - stats.norm.cdf(t_stat_hr)) + reject = t_stat_hr >= critical_value + + return YatchewTestResults( + t_stat_hr=t_stat_hr, + p_value=p_value, + reject=bool(reject), + alpha=alpha, + critical_value=critical_value, + sigma2_lin=sigma2_lin, + sigma2_diff=sigma2_diff, + sigma2_W=sigma2_W, + n_obs=G, + ) + + +def did_had_pretest_workflow( + data: pd.DataFrame, + outcome_col: str, + dose_col: str, + time_col: str, + unit_col: str, + first_treat_col: Optional[str] = None, + alpha: float = 0.05, + n_bootstrap: int = 999, + seed: Optional[int] = None, +) -> HADPretestReport: + """Run a PARTIAL HAD pre-test workflow on a two-period panel + (paper Section 4.2-4.3, steps 1 and 3 only; step 2 
deferred). + + Phase 3 scope runs: + + - Step 1: :func:`qug_test` (``H_0: d_lower = 0``, Theorem 4). + - Step 3: :func:`stute_test` and :func:`yatchew_hr_test` (linearity + of ``E[ΔY | D_2]``, Assumption 8). + + Phase 3 does **NOT** run step 2 (Assumption 7 pre-trends test via + paper Equation 18); that joint cross-horizon Stute variant is + deferred to a follow-up patch. Users should continue to perform their + own pre-trends / placebo analysis until the follow-up ships. The + returned :class:`HADPretestReport` verdict explicitly flags the + Assumption 7 gap when all implemented diagnostics fail-to-reject, so + callers do not receive a misleading "TWFE safe" signal. + + The workflow reduces the panel to unit-level first differences using + the Phase 2a validator + aggregator, then calls the three tests with + shared ``alpha`` and a single-source seed passthrough (``seed`` is + forwarded to :func:`stute_test` only; QUG and Yatchew are + deterministic). + + Parameters + ---------- + data : pd.DataFrame + Balanced two-period HAD panel. The dose column must be 0 for all + units at the pre-period (HAD no-unit-untreated pre-period + contract). + outcome_col, dose_col, time_col, unit_col : str + Column names. + first_treat_col : str or None, default None + Optional first-treatment-period column for cross-validation + (see :func:`HeterogeneousAdoptionDiD.fit`). + alpha : float, default 0.05 + n_bootstrap : int, default 999 + Replication count for :func:`stute_test`. + seed : int or None, default None + Seed forwarded to :func:`stute_test` only. + + Returns + ------- + HADPretestReport + + Notes + ----- + Phase 3 scope is two-period overall-path only. For multi-period + panels, slice to ``(F - 1, F)`` before calling. A future patch will + add a multi-period dispatch and the joint Equation 18 pre-trend + Stute test. + + References + ---------- + de Chaisemartin et al. (2026), Section 4.2-4.3, Theorem 4, Appendix D, + Theorem 7. 
+ """ + t_pre, t_post = _validate_had_panel( + data, outcome_col, dose_col, time_col, unit_col, first_treat_col + ) + d_arr, dy_arr, _, _ = _aggregate_first_difference( + data, + outcome_col, + dose_col, + time_col, + unit_col, + t_pre, + t_post, + cluster_col=None, # pretests do not use cluster-robust SE + ) + + qug_res = qug_test(d_arr, alpha=alpha) + stute_res = stute_test(d_arr, dy_arr, alpha=alpha, n_bootstrap=n_bootstrap, seed=seed) + yatchew_res = yatchew_hr_test(d_arr, dy_arr, alpha=alpha) + + # `all_pass` must be conclusive under the paper's four-step workflow + # (step 1 QUG + step 3 linearity via Stute OR Yatchew): + # - QUG must produce a finite p-value (step 1 is required). + # - At least ONE of Stute / Yatchew must produce a finite p-value + # (step 3 accepts either; the paper's wording is "Stute OR + # Yatchew"). This accommodates common QUG-style panels with + # repeated d=0 units, where Yatchew's duplicate-dose guard trips + # but Stute's tie-safe CvM still produces a conclusive result. + # - No conclusive test may reject. NaN-p tests have reject=False by + # convention, so the OR across `.reject` naturally counts only + # the conclusive rejections. 
+ qug_conclusive = bool(np.isfinite(qug_res.p_value)) + linearity_conclusive = bool(np.isfinite(stute_res.p_value) or np.isfinite(yatchew_res.p_value)) + any_reject = qug_res.reject or stute_res.reject or yatchew_res.reject + all_pass = bool(qug_conclusive and linearity_conclusive and not any_reject) + verdict = _compose_verdict(qug_res, stute_res, yatchew_res) + + return HADPretestReport( + qug=qug_res, + stute=stute_res, + yatchew=yatchew_res, + all_pass=all_pass, + verdict=verdict, + alpha=alpha, + n_obs=int(d_arr.shape[0]), + ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index ae629889..6ad71b04 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2267,7 +2267,7 @@ so `δ_0` is recovered by OLS of `ΔY` on `X` and `D_2 * X`; Average Slope is `( 3. Report the estimate as WAS_{d̲} under Assumption 6 or as the sign-identifying quantity under Assumption 5. *Algorithm variant - QUG null test (Theorem 4, Section 3.3):* -Tuning-parameter-free test of `H_0: d̲ = 0` versus `H_1: d̲ > 0`. Shipped in `diff_diff/diagnostics.py` as `qug_test()`. +Tuning-parameter-free test of `H_0: d̲ = 0` versus `H_1: d̲ > 0`. Shipped in `diff_diff/had_pretests.py` as `qug_test()`. 1. Sort `D_{2,g}` ascending to obtain order statistics `D_{2,(1)} ≤ D_{2,(2)} ≤ ... ≤ D_{2,(G)}`. 2. Compute test statistic `T := D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`. 3. Reject `H_0` if `T > 1/α - 1`. @@ -2276,7 +2276,7 @@ Tuning-parameter-free test of `H_0: d̲ = 0` versus `H_1: d̲ > 0`. Shipped in ` - **Note:** Implementation is `O(G)` via `np.partition`; no sort required. *Algorithm variant - TWFE linearity test via Stute (1997) Cramér-von Mises with wild bootstrap (Section 4.3, Appendix D):* -Shipped in `diff_diff/diagnostics.py` as `stute_test()`. Tests whether `E(ΔY | D_2)` is linear, the testable implication of TWFE's homogeneity assumption (Assumption 8) in HADs. +Shipped in `diff_diff/had_pretests.py` as `stute_test()`. 
Tests whether `E(ΔY | D_2)` is linear, the testable implication of TWFE's homogeneity assumption (Assumption 8) in HADs. 1. Fit linear regression of `ΔY_g` on constant and `D_{g,2}`; collect residuals `ε̂_{lin,g}`. 2. Form cusum process `c_G(d) := G^{-1/2} Σ_{g=1}^G 1{D_{g,2} ≤ d} · ε̂_{lin,g}`. 3. Compute Cramér-von Mises statistic `S := (1/G) Σ_{g=1}^G c_G²(D_{g,2})`. Equivalently, after sorting by `D_{g,2}`: `S = Σ_{g=1}^G (g/G)² · ((1/g) Σ_{h=1}^g ε̂_{lin,(h)})²`. @@ -2288,10 +2288,10 @@ Shipped in `diff_diff/diagnostics.py` as `stute_test()`. Tests whether `E(ΔY | - Repeat B times; the p-value is the fraction of `S*` exceeding `S`. 5. Properties (page 26): asymptotic size, consistency under any fixed alternative, non-trivial local power at rate `G^{-1/2}`. 6. Vectorized implementation (Appendix D): with `L` a `G × G` lower-triangular matrix of ones, `S = (1/G²) · 1ᵀ (L · E)^{∘2}`. Bootstrap uses a `G × G` realization matrix `H` of Mammen weights; memory-bounded at `G ≈ 100,000`. -- **Note:** Default `n_bootstrap = 499` is a diff-diff choice; the paper does not prescribe. +- **Note:** Default `n_bootstrap = 999` is a diff-diff choice; the paper does not prescribe one. A front-door `ValueError` is raised for `n_bootstrap < 99` (below which the discretised bootstrap p-value grid `1/(B+1)` is too coarse to be meaningful). *Algorithm variant - Yatchew (1997) heteroskedasticity-robust linearity test (Appendix E, Theorem 7):* -Shipped in `diff_diff/diagnostics.py` as `yatchew_hr_test()`. Alternative to Stute when `G` is large or heteroskedasticity is suspected. +Shipped in `diff_diff/had_pretests.py` as `yatchew_hr_test()`. Alternative to Stute when `G` is large or heteroskedasticity is suspected. 1. Sort `(D_{g,2}, ΔY_g)` by `D_{g,2}`. 2. Compute difference-based variance estimator: `σ̂²_{diff} := (1/(2G)) Σ_{g=2}^G [(Y_{2,(g)} - Y_{1,(g)}) - (Y_{2,(g-1)} - Y_{1,(g-1)})]²`. 3. Fit linear regression; compute residual variance `σ̂²_{lin}`.
@@ -2301,12 +2301,14 @@ Shipped in `diff_diff/diagnostics.py` as `yatchew_hr_test()`. Alternative to Stu 7. Inference on `β̂_{fe}` conditional on accepting the linearity test is asymptotically valid (Theorem 7, Point 1; citing de Chaisemartin and D'Haultfœuille 2024 arXiv:2407.03725). *Four-step pre-testing workflow (Section 4.2-4.3):* -Shipped as `did_had_pretest_workflow()` and surfaced via `practitioner_next_steps()`. The paper's decision rule for TWFE reliability in HADs: +Shipped as `did_had_pretest_workflow()` in Phase 3. The paper's decision rule for TWFE reliability in HADs: 1. Test the null of a QUG (`H_0: d̲ = 0`) using `qug_test()`. 2. Run a pre-trends test of Assumption 7 (requires a pre-period `t=0`). 3. Test that `E(ΔY | D_2)` is linear (`stute_test` or `yatchew_hr_test`). 4. If NONE of the three is rejected, `β̂_{fe}` from TWFE may be used to estimate the treatment effect. +**Phase 3 delivery:** `did_had_pretest_workflow()` runs steps 1 + 3 (QUG + Stute + Yatchew) on a two-period panel and returns a verdict. Step 2 (pre-trends test via Equation 18 joint cross-horizon Stute on pre-period placebos) is deferred to a Phase 3 follow-up patch; see `TODO.md`. The `practitioner_next_steps()` integration is queued for Phase 5. + **Reference implementation(s):** - R: `did_had` (de Chaisemartin, Ciccia, D'Haultfœuille, Knau 2024a); `stute_test` (2024c); `yatchew_test` (Online Appendix, Table 3). - Stata: `did_had` (2024b); `stute_test` (2024d); `yatchew_test`. Also `twowayfeweights` (de Chaisemartin, D'Haultfœuille, Deeb 2019) for negative-weight diagnostics. @@ -2332,16 +2334,21 @@ Shipped as `did_had_pretest_workflow()` and surfaced via `practitioner_next_step - **Note (CI endpoints):** Because the continuous-path `att` is `(mean(ΔY) - τ̂_bc) / den`, the beta-scale CI endpoints reverse relative to the Phase 1c boundary-limit CI: `CI_lower(β̂) = (mean(ΔY) - CI_upper(τ̂_bc)) / den` and `CI_upper(β̂) = (mean(ΔY) - CI_lower(τ̂_bc)) / den`. 
The `HeterogeneousAdoptionDiD.fit()` implementation computes `att ± z · se` directly via `safe_inference`, which handles the reversal naturally from the transformed point estimate. - **Note (Phase 2a/2b scope):** Phase 2a ships the single-period `aggregate="overall"` path; Phase 2b lifts `aggregate="event_study"` (Appendix B.2 multi-period extension) which returns a `HeterogeneousAdoptionDiDEventStudyResults` with per-event-time WAS estimates and pointwise CIs. `survey=` and `weights=` kwargs raise `NotImplementedError` pointing to the follow-up survey-integration PR. - **Note (panel-only):** The paper (Section 2) defines HAD on *panel or repeated cross-section* data, but both the overall and event-study paths ship a panel-only implementation: `HeterogeneousAdoptionDiD.fit()` requires a balanced panel with a unit identifier so that unit-level first differences `ΔY_{g,t} = Y_{g,t} - Y_{g,t_anchor}` can be formed. Repeated-cross-section inputs (disjoint unit IDs between periods) are rejected by the balanced-panel validator. RCS support is queued for a follow-up PR (tracked in `TODO.md`); it will need a separate identification path based on pre/post cell means rather than unit-level differences. -- [x] Phase 2b: Multi-period event-study extension (Appendix B.2). `aggregate="event_study"` produces per-event-time WAS estimates using a uniform `F-1` baseline (`ΔY_{g,t} = Y_{g,t} - Y_{g,F-1}` for every horizon), reusing the three Phase 2a design paths on per-horizon first differences. Pre-period placebos included for `e <= -2` (the anchor `e = -1` is skipped since `ΔY = 0` trivially). Post-period estimates for `e >= 0`. The joint Stute test (Equation 18) across pre-periods is a SEPARATE diagnostic deferred to Phase 3 (pre-test diagnostics). +- [x] Phase 2b: Multi-period event-study extension (Appendix B.2). 
`aggregate="event_study"` produces per-event-time WAS estimates using a uniform `F-1` baseline (`ΔY_{g,t} = Y_{g,t} - Y_{g,F-1}` for every horizon), reusing the three Phase 2a design paths on per-horizon first differences. Pre-period placebos included for `e <= -2` (the anchor `e = -1` is skipped since `ΔY = 0` trivially). Post-period estimates for `e >= 0`. The joint Stute test (Equation 18) across pre-periods is a SEPARATE diagnostic deferred to a **Phase 3 follow-up patch** (Phase 3 ships the single-horizon Stute test; the joint stacked-residual variant is tracked in `TODO.md`). - **Note (Phase 2b last-cohort filter):** When `first_treat_col` indicates more than one nonzero cohort, the panel is auto-filtered to the last-treatment cohort (`F_last = max(cohorts)`) **plus never-treated units** (`first_treat = 0`), with a `UserWarning` naming kept/dropped unit counts and dropped cohort labels. Paper Appendix B.2 is explicit that HAD "may be used only for the LAST treatment cohort in a staggered design"; the auto-filter implements this prescription, retaining never-treated units per the paper's "there must be an untreated group, at least till the period where the last cohort gets treated" requirement. Only earlier-cohort units (with `first_treat > 0` and `< F_last`) are dropped — never-treated units satisfy the dose invariant at every period (`D = 0` throughout) and preserve Design 1' identifiability (boundary at `0`) when last-cohort doses are uniformly positive. When `first_treat_col` is omitted on a >2-period panel, the validator infers each unit's first-positive-dose period from the dose path; if multiple distinct first-positive-dose cohorts are detected, the estimator raises a front-door `ValueError` directing users to pass `first_treat_col` (which activates the auto-filter) or use `ChaisemartinDHaultfoeuille` for full staggered support — there is no silent acceptance of staggered panels without cohort metadata. 
Common-adoption panels (single first-positive-dose cohort, or only never-treated + one cohort) pass through unchanged with `F` inferred from the dose invariant, and require dose contiguity (pre-periods < post-periods in natural ordering). Non-contiguous dose sequences (e.g., reverse treatment) raise with a pointer to `ChaisemartinDHaultfoeuille`. - **Note (Phase 2b constant-dose requirement):** The event-study aggregation uses `D_{g, F}` (first-treatment-period dose) as the single regressor for every event-time horizon, per paper Appendix B.2's "once treated, stay treated with the same dose" convention. The validator REJECTS panels where a unit has time-varying dose across post-treatment periods (`D_{g, t} != D_{g, F}` for any `t >= F` within-unit, beyond float tolerance) with a front-door `ValueError`, directing users with genuinely time-varying post-treatment doses to `ChaisemartinDHaultfoeuille` (`did_multiplegt_dyn`). Silent acceptance would misattribute later-horizon treatment-effect heterogeneity to the period-F dose. A follow-up PR could implement a time-varying-dose estimator; tracked in `TODO.md`. - **Note (Phase 2b per-horizon SE):** Each event-time horizon uses an INDEPENDENT sandwich computed on that horizon's first differences: continuous paths use the CCT-2014 robust SE from Phase 1c divided by `|den|`; mass-point path uses the structural-residual 2SLS sandwich from Phase 2a. This produces pointwise CIs per horizon, matching the paper's Pierce-Schott application (Section 5.2, Figure 2: "nonparametric pointwise CIs"). Joint cross-horizon covariance (IF-based stacking or block bootstrap) is NOT computed — the paper does not derive it and all reported CIs are pointwise. Follow-up PRs may add joint covariance for cross-horizon hypothesis tests; current tracking in `TODO.md`. - **Note (Phase 2b baseline convention):** All event-time horizons use a uniform `F-1` anchor: `ΔY_{g,t} = Y_{g,t} - Y_{g,F-1}` for every `t`. 
This is consistent with the paper's Garrett-et-al. application (Section 5.1: "outcome `Y_{g,t} - Y_{g,2001}`" where `F = 2002`), simplifies event-time indexing (`e = t - F` so `e = -1` is the anchor, skipped), and keeps the implementation symmetric for pre- and post-period horizons. The paper review text's asymmetric "`Y_{g,t} - Y_{g,1}` for pre" / "`Y_{g,t} - Y_{g,F-1}` for post" phrasing is covered by the uniform convention since both give the same placebo interpretation under parallel trends (the paper's own applications use the uniform anchor). - **Note (Phase 2b result class):** `aggregate="event_study"` returns a new `HeterogeneousAdoptionDiDEventStudyResults` dataclass (distinct from the single-period `HeterogeneousAdoptionDiDResults`) with per-horizon arrays (`event_times`, `att`, `se`, `t_stat`, `p_value`, `conf_int_low`, `conf_int_high`, `n_obs_per_horizon`) and shared metadata. `to_dataframe()` returns a tidy per-horizon DataFrame; `to_dict()` returns a dict with list-of-per-horizon fields. The static return-type annotation on `fit()` is `HeterogeneousAdoptionDiDResults` (the common case); callers passing `aggregate="event_study"` should annotate their variable as `HeterogeneousAdoptionDiDEventStudyResults` for type checkers. -- [ ] Phase 3: `qug_test()` (`T = D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`, rejection `{T > 1/α - 1}`). -- [ ] Phase 3: `stute_test()` Cramér-von Mises with Mammen wild bootstrap. -- [ ] Phase 3: `yatchew_hr_test()` heteroskedasticity-robust linearity test. -- [ ] Phase 3: `did_had_pretest_workflow()` composite helper. + - **Note (Phase 3 pretest workflow):** Phase 3 ships the Section 4 pre-test diagnostics in a new module `diff_diff/had_pretests.py` (separate from the 2,800-line `had.py` estimator module). 
The three tests — `qug_test`, `stute_test`, `yatchew_hr_test` — accept raw `(d, dy)` arrays and return their own result dataclasses (`QUGTestResults`, `StuteTestResults`, `YatchewTestResults`); the composite `did_had_pretest_workflow` accepts a two-period panel and returns `HADPretestReport` with a priority-ordered verdict string. The `data`-only workflow signature (no `result=`-consuming variant) avoids coupling pretests to the `BiasCorrectedFit` internal state, which does not expose residuals. Below-minimum sample sizes emit `UserWarning` and return all-NaN inference (no `ValueError` raise) for library consistency with the `safe_inference` convention; input-shape violations (non-1D, NaN-containing, mismatched length) still raise. No multiple-testing adjustment (Bonferroni/Holm) is applied — the paper does not prescribe one. **Partial-workflow semantic:** the composite runs paper steps 1 (QUG) + 3 (linearity via Stute + Yatchew-HR) only; step 2 (Assumption 7 pre-trends test via Equation 18) is deferred to a Phase 3 follow-up patch. When all three implemented tests fail-to-reject, the verdict explicitly flags the Assumption 7 gap (`"QUG and linearity diagnostics fail-to-reject; Assumption 7 pre-trends test NOT run (paper step 2 deferred to Phase 3 follow-up)"`) rather than claiming unconditional TWFE safety, so callers do not receive a misleading certification signal. The `HADPretestReport` docstring reinforces this caveat. Users should continue to perform their own pre-trends / placebo analysis until the follow-up ships the joint Equation 18 Stute variant. + - **Note (Phase 3 Stute bootstrap):** The Stute CvM statistic is implemented via the simplified-form `S = (1/G²) · Σ cumsum²`, which is algebraically equivalent to the paper's stated `Σ(g/G)² · ((1/g) Σ eps_{(h)})²` form. Bootstrap follows the paper's Appendix D Algorithm literally: each iteration refits OLS on `dy_b = a_hat + b_hat · d + eps · eta` (null DGP multiplied by Mammen weights). 
The Appendix-D vectorized matrix form (precomputing `M = I - X(X'X)⁻¹X'` once and applying it to `eps · eta` in each iteration) is functionally identical and ~2× faster; it is deferred as a performance follow-up to keep the Phase 3 reviewer surface small. Bootstrap reproducibility uses `np.random.default_rng(seed)` (library convention, matching `wild_bootstrap_se`); `seed=42` in tests for bitwise stability. + - **Note (Phase 3 Yatchew normalizer):** `σ̂²_diff` in `yatchew_hr_test` divides by `2G` (paper-literal from Theorem 7), NOT by `2(G-1)`. A unit test hand-computes `σ̂²_diff` at `G=4` on deterministic inputs and asserts the `2G`-normalizer form at `atol=1e-12` so any later regression to the (asymptotically equivalent but finite-sample different) `2(G-1)` form would fail the test. `σ̂⁴_W` uses `np.mean(eps_{(g)}² · eps_{(g-1)}²)` which divides by `G-1` via `np.mean` length, matching the paper's Theorem 7 / Equation 29 normalization. + - **Note (Phase 3 Yatchew tie policy):** `yatchew_hr_test` REJECTS duplicate dose values with a `UserWarning` + all-NaN result at the front door. The difference-based variance estimator `σ̂²_diff` and the heteroskedasticity-robust scale `σ̂⁴_W` both use adjacent differences (of sorted dy and of adjacent squared residuals, respectively); under tied doses the within-tie row ordering is arbitrary (stable sort falls back to input order), so the statistic becomes non-methodological and order-dependent. Callers with tied doses (mass-point designs, discretised dose registers) should use `stute_test` instead — its tie-safe Cramér-von Mises statistic collapses tie blocks to the post-tie cumulative sum and is provably order-invariant under within-tie permutations. A regression test on `stute_test` asserts matching `cvm_stat` across tied-dose permutations at `atol=1e-14`; a matching regression on `yatchew_hr_test` asserts the NaN+warning behavior on duplicated and constant doses.
This tie policy is a Phase 3 diff-diff choice (the paper describes Yatchew only as "sort by D" and does not specify a tie rule); an alternative implementation could document a justified tie-resolution convention and back it with permutation tests, but at the cost of a methodology surface the paper does not cover. **Workflow fallback:** `did_had_pretest_workflow` follows the paper's "Stute OR Yatchew" step-3 wording — a conclusive Stute (which is tie-safe) is sufficient to adjudicate linearity even when Yatchew returns NaN on tied-dose panels. The composite verdict then reads "QUG and linearity diagnostics fail-to-reject (Yatchew NaN - skipped); ..." rather than forcing the whole report to "inconclusive", and `all_pass` is `True` when QUG + at least one conclusive linearity test fail-to-reject. + - **Note (Phase 3 exact-linear short-circuit):** Both `stute_test` and `yatchew_hr_test` detect numerically-exact linear fits (OLS residuals below machine precision relative to the signal) and short-circuit to `p=1.0, reject=False` without running the bootstrap / computing `T_hr`. The detection compares `sum(eps^2)` against the CENTERED total sum of squares `sum((dy - dybar)^2)` (ratio `<= 1e-24`), which is equivalent to `1 - R^2` and is TRANSLATION-INVARIANT under additive shifts in dy. Comparing against uncentered `sum(dy^2)` would NOT be translation-invariant: adding a large constant to dy inflates `sum(dy^2)` and can spuriously trip the short-circuit on genuinely noisy data. Regression tests pin (a) the exact-linear fail-to-reject behavior for `dy = a + b*d`, (b) translation-invariance under `dy -> dy + 1e12` (the short-circuit must not fire on noisy data regardless of the constant offset). +- [x] Phase 3: `qug_test()` (`T = D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`, rejection `{T > 1/α - 1}`). One-sided asymptotic p-value `1 / (1 + T)` under the Exp(1)/Exp(1) limit law (Theorem 4). 
Zero-dose observations filtered upfront (with `UserWarning`); `D_{(1)} == D_{(2)}` tie returns all-NaN inference (conservative). Closed-form tight parity tested at `atol=1e-12`. +- [x] Phase 3: `stute_test()` Cramér-von Mises with Mammen wild bootstrap. Statistic `S = (1/G^2) Σ (cumsum_g)^2` (algebraically equivalent to paper's `Σ(g/G)^2 · ((1/g) Σ eps_{(h)})^2`). Bootstrap follows the paper's Appendix D algorithm literally (per-iteration OLS refit). `n_bootstrap=999` default, `n_bootstrap >= 99` validated. `G < 10` returns NaN; `G > 100_000` emits a `UserWarning` pointing to `yatchew_hr_test`. Appendix-D vectorized matrix form deferred as a performance follow-up (tracked in `TODO.md`). +- [x] Phase 3: `yatchew_hr_test()` heteroskedasticity-robust linearity test. Test statistic `T_hr = sqrt(G) · (σ̂²_lin - σ̂²_diff) / σ̂²_W` from paper Equation 29. Normalizer `σ̂²_diff` divides by `2G` (paper-literal Theorem 7), NOT `2(G-1)`; hand-computed tight parity asserted at `atol=1e-12`. One-sided standard-normal critical value. `G < 3` returns NaN. +- [x] Phase 3: `did_had_pretest_workflow()` composite helper. Two-period `data`-only entry point (Phase 2a overall-path dispatch); reduces panel via `_aggregate_first_difference` and runs all three IMPLEMENTED tests at a shared `alpha`. `seed` forwards to `stute_test` only (QUG and Yatchew are deterministic). Returns `HADPretestReport` with priority-ordered verdict string. Because Phase 3 ships steps 1 + 3 of the paper's four-step workflow but **not** step 2 (Assumption 7 pre-trends test via Equation 18), the fail-to-reject verdict explicitly flags the Assumption 7 gap rather than claiming unconditional TWFE safety: `"QUG and linearity diagnostics fail-to-reject; Assumption 7 pre-trends test NOT run (paper step 2 deferred to Phase 3 follow-up)"`.
Verdict priority follows the paper's one-way rule (TWFE admissible only if NO test rejects): **conclusive rejections are the primary verdict and are NEVER hidden by inconclusive status** — any unresolved-step note is appended via `"; additional steps unresolved: ..."` rather than replacing the rejection. The pure `"inconclusive - QUG NaN"` / `"inconclusive - both Stute and Yatchew linearity tests NaN"` forms only fire when NO conclusive test rejects AND a required step is unresolved. The partial-workflow fail-to-reject verdict may carry a `"(Yatchew NaN - skipped)"` (or Stute) suffix when one linearity test is NaN but the other is conclusive (step 3 resolved via the paper's "Stute OR Yatchew" wording). Bundled rejection-reason strings name each failed assumption in the conclusive-rejection case. `all_pass` is `True` iff QUG is conclusive AND at least one of Stute/Yatchew is conclusive AND no conclusive test rejects. **Non-negative-dose contract**: all three raw linearity helpers (`qug_test`, `stute_test`, `yatchew_hr_test`) raise a front-door `ValueError` on any `d < 0`, mirroring the `_validate_had_panel` guard (paper Section 2 HAD support restriction). Multi-period panels pre-slice to `(F-1, F)` before calling; joint-horizon dispatch deferred to Phase 3 follow-up. - [ ] Phase 4: Pierce-Schott (2016) replication harness reproduces Figure 2 values. - [ ] Phase 4: Full DGP 1/2/3 coverage-rate reproduction from Table 1. - [ ] Phase 5: `practitioner_next_steps()` integration for HAD results. diff --git a/docs/methodology/papers/dechaisemartin-2026-review.md b/docs/methodology/papers/dechaisemartin-2026-review.md index 0dce9765..15c62eee 100644 --- a/docs/methodology/papers/dechaisemartin-2026-review.md +++ b/docs/methodology/papers/dechaisemartin-2026-review.md @@ -183,14 +183,15 @@ Alternative to Stute when `G` is large or heteroskedasticity is suspected. 
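The heteroskedasticity-robust Yatchew statistic catalogued in this hunk (`T_hr = sqrt(G)·(σ²_lin - σ²_diff)/σ²_W` with the paper-literal `2G` normalizer) can be sketched in a few lines of numpy. This is an illustrative sketch, not the `diff_diff` implementation: the helper name is hypothetical, and reading the denominator as `σ̂²_W = sqrt(σ̂⁴_W)` is an assumption consistent with the hand-computed values pinned in the test suite:

```python
import numpy as np


def yatchew_hr_sketch(d: np.ndarray, dy: np.ndarray):
    """Illustrative T_hr = sqrt(G) * (s2_lin - s2_diff) / sqrt(s4_W).

    Hypothetical helper: sorts by dose, fits the 2-parameter line by OLS,
    and uses the paper-literal 2G normalizer for the difference variance.
    """
    order = np.argsort(d, kind="stable")
    d_s, dy_s = d[order], dy[order]
    G = len(d_s)
    X = np.column_stack([np.ones(G), d_s])
    beta, *_ = np.linalg.lstsq(X, dy_s, rcond=None)
    eps = dy_s - X @ beta
    s2_lin = np.mean(eps**2)                          # linear-model residual variance
    s2_diff = np.sum(np.diff(dy_s) ** 2) / (2.0 * G)  # paper-literal 2G, NOT 2(G-1)
    s4_W = np.mean(eps[1:] ** 2 * eps[:-1] ** 2)      # adjacent squared-residual products
    t_hr = np.sqrt(G) * (s2_lin - s2_diff) / np.sqrt(s4_W)
    return t_hr, s2_lin, s2_diff
```

On the deterministic `G = 4` input from the test suite (`d = [0.1, 0.2, 0.3, 0.4]`, `dy = [1, 3, 2, 5]`), the `2G` normalizer gives `s2_diff = 14 / 8 = 1.75`, whereas the `2(G-1)` form would give `14 / 6 ≈ 2.33`.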
- [ ] Separate code paths for Design 1' (`d̲ = 0`), Design 1 mass-point (`d̲ > 0` discrete), and Design 1 continuous-near-`d̲`. - [ ] Local-linear regression backend (kernel weights, bandwidth selector). - [ ] Integration with bias-corrected CI from Calonico-Cattaneo-Farrell. -- [ ] QUG null test (`T = D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`, rejection region `{T > 1/α - 1}`). -- [ ] Stute Cramér-von Mises test with Mammen wild bootstrap. -- [ ] Yatchew heteroskedasticity-robust linearity test. +- [x] QUG null test (`T = D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`, rejection region `{T > 1/α - 1}`). **Phase 3 implementation (2026-04):** `qug_test()` in `diff_diff/had_pretests.py`. Asymptotic p-value `1/(1+T)` under Exp(1)/Exp(1) limit law. Zero-dose observations filtered upfront with `UserWarning`; tie-break `D_{(1)} == D_{(2)}` returns all-NaN inference. Tight closed-form parity at `atol=1e-12`. +- [x] Stute Cramér-von Mises test with Mammen wild bootstrap. **Phase 3 implementation (2026-04):** `stute_test()` in `diff_diff/had_pretests.py`. Literal per-iteration OLS refit per paper Appendix D Algorithm. `n_bootstrap=999` default, `n_bootstrap >= 99` validated. Single-horizon only; joint Equation 18 cross-horizon Stute (Paper Section 5.2 Pierce-Schott application) deferred to a Phase 3 follow-up patch. +- [x] Yatchew heteroskedasticity-robust linearity test. **Phase 3 implementation (2026-04):** `yatchew_hr_test()` in `diff_diff/had_pretests.py`. Test statistic `T_hr = sqrt(G)·(σ²_lin - σ²_diff)/σ²_W` from paper Equation 29. `σ²_diff` normalizes by `2G` (paper-literal), NOT `2(G-1)` (finite-sample equivalent but tests pin the paper-literal form). Standard-normal critical value, one-sided. +- [x] Composite workflow `did_had_pretest_workflow()` (paper Section 4.2-4.3). **Phase 3 implementation (2026-04):** Two-period panel entry point runs all three tests and returns `HADPretestReport` with priority-ordered verdict string. 
The paper's step 2 (pre-trends test of Assumption 7) requires Equation 18 and is deferred to the Phase 3 follow-up patch. - [ ] Warnings for staggered treatment timing (direct users to existing `ChaisemartinDHaultfoeuille` in diff-diff). - [ ] Warnings for extensive-margin effects / positive mass of untreated (not fatal; suggests running existing DiD). - [ ] Documentation of non-testability of Assumptions 5 and 6. - [x] Multi-period event-study extension (Appendix B.2). **Phase 2b implementation (2026-04):** `aggregate="event_study"` returns per-event-time WAS estimates using uniform `F-1` anchor. Staggered timing auto-filtered to last cohort with `UserWarning` per Appendix B.2 prescription. Pointwise CIs per horizon (no joint cross-horizon covariance; matches paper's Pierce-Schott Figure 2). Pre-period placebos at `e <= -2`; the anchor `e = -1` is skipped since `ΔY = 0` there by construction. -- [ ] Joint Stute test (Equation 18) across pre-periods. Deferred to Phase 3 (pre-test diagnostics). +- [ ] Joint Stute test (Equation 18) across pre-periods. Deferred to a **Phase 3 follow-up patch** — Phase 3 (2026-04) shipped the single-horizon Stute test but not the joint cross-horizon variant; the exact stacked-residual formula needs extraction from the paper PDF (not reproduced in this review). Tracked in `TODO.md`. 
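The shipped single-horizon Stute test can be sketched as follows. This is a minimal numpy illustration of the no-ties simplified statistic and the Mammen wild bootstrap (per-iteration OLS refit, as in the checklist above); the helper name and the `(1 + #exceedances) / (1 + B)` p-value convention are assumptions, not the exact `stute_test` implementation, and the tie-block collapse used by the library is omitted:

```python
import numpy as np


def stute_sketch(d, dy, n_bootstrap=199, seed=42):
    """Illustrative single-horizon Stute CvM with Mammen wild bootstrap.

    No-ties simplified form S = (1/G^2) * sum_g (cumsum_g)^2; each bootstrap
    iteration refits OLS on the null DGP times Mammen two-point weights.
    """
    rng = np.random.default_rng(seed)
    order = np.argsort(d, kind="stable")
    d_s, dy_s = d[order], dy[order]
    G = len(d_s)
    X = np.column_stack([np.ones(G), d_s])

    def resid(y):
        beta, *_ = np.linalg.lstsq(X, y, rcond=None)
        return y - X @ beta

    eps = resid(dy_s)
    S_obs = np.sum(np.cumsum(eps) ** 2) / G**2
    fitted = dy_s - eps  # a_hat + b_hat * d
    # Mammen two-point weights: mean 0, variance 1, third moment 1.
    s5 = np.sqrt(5.0)
    p_lo = (s5 + 1.0) / (2.0 * s5)
    exceed = 0
    for _ in range(n_bootstrap):
        eta = np.where(rng.random(G) < p_lo, (1.0 - s5) / 2.0, (1.0 + s5) / 2.0)
        eps_b = resid(fitted + eps * eta)  # dy_b = a_hat + b_hat*d + eps*eta
        exceed += np.sum(np.cumsum(eps_b) ** 2) / G**2 >= S_obs
    return S_obs, (1.0 + exceed) / (1.0 + n_bootstrap)
```

Because the bootstrap draws destroy any systematic residual pattern while preserving the null linear fit, a strongly curved dose-response drives the observed `S` far above the bootstrap distribution.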
--- diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py new file mode 100644 index 00000000..a47e5991 --- /dev/null +++ b/tests/test_had_pretests.py @@ -0,0 +1,1114 @@ +"""Tests for HAD Phase 3 pre-test diagnostics (had_pretests.py).""" + +from __future__ import annotations + +import json +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff import ( + QUGTestResults, + StuteTestResults, + YatchewTestResults, + did_had_pretest_workflow, + qug_test, + stute_test, + yatchew_hr_test, +) +from diff_diff.had_pretests import _compose_verdict + +# ============================================================================= +# Helpers +# ============================================================================= + + +def _make_two_period_panel( + G: int, + d: np.ndarray, + dy: np.ndarray, + seed: int = 42, +) -> pd.DataFrame: + """Construct a two-period panel satisfying the HAD overall-path contract. + + - ``time == 0``: ``d = 0`` for all units (HAD no-unit-untreated pre). + - ``time == 1``: dose ``d`` per unit, outcome ``y_pre + dy``. 
+ """ + rng = np.random.default_rng(seed) + y_pre = rng.normal(0.0, 1.0, size=G) + pre = pd.DataFrame({"unit": np.arange(G), "time": 0, "y": y_pre, "d": np.zeros(G)}) + post = pd.DataFrame({"unit": np.arange(G), "time": 1, "y": y_pre + dy, "d": d}) + return pd.concat([pre, post], ignore_index=True) + + +def _linear_dgp(G: int, beta: float = 2.0, sigma: float = 0.3, seed: int = 42): + """Return ``(d, dy)`` for a linear DGP ``dy = beta * d + eps``.""" + rng = np.random.default_rng(seed) + d = rng.uniform(0.0, 1.0, size=G) + dy = beta * d + rng.normal(0.0, sigma, size=G) + return d, dy + + +def _quadratic_dgp( + G: int, beta: float = 2.0, gamma: float = 5.0, sigma: float = 0.3, seed: int = 42 +): + """Return ``(d, dy)`` for a quadratic DGP ``dy = beta*d + gamma*d^2 + eps``.""" + rng = np.random.default_rng(seed) + d = rng.uniform(0.1, 1.0, size=G) + dy = beta * d + gamma * d * d + rng.normal(0.0, sigma, size=G) + return d, dy + + +# ============================================================================= +# QUG test +# ============================================================================= + + +class TestQUGTest: + """Tests for :func:`qug_test`.""" + + def test_tight_parity_closed_form(self): + """Commit criterion 4: [0.2, 0.5, 0.9] -> T = 0.2/0.3 at atol=1e-12.""" + d = np.array([0.2, 0.5, 0.9]) + r = qug_test(d, alpha=0.05) + expected_T = 0.2 / (0.5 - 0.2) + expected_p = 1.0 / (1.0 + expected_T) + np.testing.assert_allclose(r.t_stat, expected_T, atol=1e-12) + np.testing.assert_allclose(r.p_value, expected_p, atol=1e-12) + assert r.critical_value == pytest.approx(1.0 / 0.05 - 1.0, rel=1e-12) + + def test_tie_break(self): + """Commit criterion 3: D_(1) == D_(2) -> NaN, reject=False.""" + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + r = qug_test(np.array([0.3, 0.3, 0.5])) + assert len(caught) == 1 + assert "D_(1) == D_(2)" in str(caught[0].message) + assert np.isnan(r.t_stat) + assert np.isnan(r.p_value) + assert 
r.reject is False + assert r.d_order_1 == 0.3 + assert r.d_order_2 == 0.3 + + def test_zero_filter_emits_warning(self): + """Zero-dose observations are excluded with a UserWarning.""" + d = np.array([0.0, 0.0, 0.3, 0.5, 0.9]) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + r = qug_test(d) + messages = [str(w.message) for w in caught] + assert any("excluded 2" in m for m in messages) + assert r.n_obs == 3 + assert r.n_excluded_zero == 2 + # Should reduce to [0.3, 0.5, 0.9]: T = 0.3 / 0.2 = 1.5 + np.testing.assert_allclose(r.t_stat, 0.3 / 0.2, atol=1e-12) + + def test_rejects_on_shifted_beta(self): + """Commit criterion 2 (single-realization): shifted-Beta -> reject.""" + rng = np.random.default_rng(42) + # Support is [0.2, 1.2]: infimum is well away from zero. + d = 0.2 + rng.beta(2, 2, size=200) + r = qug_test(d, alpha=0.05) + # T = D_(1)/(D_(2) - D_(1)). D_(1) ~ 0.2; gap ~ 0.001 at G=200. + # So T is large -> reject. + assert r.reject is True + assert r.p_value < 0.05 + + def test_does_not_reject_on_uniform_support_zero(self): + """Commit criterion 1 (single-realization): Uniform(0,1) -> usually no reject.""" + rng = np.random.default_rng(42) + d = rng.uniform(0.0, 1.0, size=500) + r = qug_test(d, alpha=0.05) + # Uniform support has infimum at 0 (asymptotic). For G=500, + # D_(1) is small but D_(2) - D_(1) also small -> T ~ O(1). + # Not a guaranteed non-reject at single seed, but this seed passes. + assert r.p_value > 0.05 + assert r.reject is False + + def test_reject_rate_bounded_by_alpha_on_uniform_null(self): + """Commit criterion 1: asymptotic size alpha is an UPPER bound. + + Paper Theorem 4 establishes ``lim sup_{G->inf} P(reject | H_0) = alpha`` + (one-sided asymptotic size bound). At finite G the test is often + conservative (actual rate below alpha). We verify the asymptotic + size control: reject rate MUST NOT exceed alpha by more than 0.03. 
+ """ + alpha = 0.05 + n_reps = 200 + rejections = 0 + rng = np.random.default_rng(12345) + for _ in range(n_reps): + d = rng.uniform(0.0, 1.0, size=500) + r = qug_test(d, alpha=alpha) + if r.reject: + rejections += 1 + rate = rejections / n_reps + assert rate <= alpha + 0.03, ( + f"QUG reject rate {rate:.3f} exceeds alpha={alpha} by more than " + f"0.03; asymptotic size control violated." + ) + + def test_invalid_alpha_raises(self): + with pytest.raises(ValueError, match="alpha must satisfy"): + qug_test(np.array([0.1, 0.5]), alpha=0.0) + with pytest.raises(ValueError, match="alpha must satisfy"): + qug_test(np.array([0.1, 0.5]), alpha=1.0) + + def test_nan_input_raises(self): + with pytest.raises(ValueError, match="contains NaN"): + qug_test(np.array([0.1, np.nan, 0.5])) + + def test_2d_input_raises(self): + with pytest.raises(ValueError, match="must be 1-dimensional"): + qug_test(np.array([[0.1, 0.5], [0.3, 0.9]])) + + def test_negative_dose_raises(self): + """P2 fix: HAD doses must be non-negative; qug_test rejects negatives + at the front door rather than silently folding them into the + zero-exclusion counter.""" + with pytest.raises(ValueError, match="negative value"): + qug_test(np.array([0.1, -0.3, 0.5, 0.9])) + + +class TestNegativeDoseGuardsOnLinearityTests: + """R6 P1 fix: stute_test and yatchew_hr_test must enforce the same + HAD non-negative-dose contract as qug_test and the panel validator.""" + + def test_stute_negative_dose_raises(self): + d = np.linspace(-0.2, 0.8, 30) # contains negatives + dy = 2.0 * d + np.random.default_rng(42).normal(0.0, 0.1, size=30) + with pytest.raises(ValueError, match="negative value"): + stute_test(d, dy, n_bootstrap=199, seed=42) + + def test_yatchew_negative_dose_raises(self): + d = np.linspace(-0.2, 0.8, 30) + dy = 2.0 * d + np.random.default_rng(42).normal(0.0, 0.1, size=30) + with pytest.raises(ValueError, match="negative value"): + yatchew_hr_test(d, dy) + + +# 
============================================================================= +# Stute test +# ============================================================================= + + +class TestStuteTest: + """Tests for :func:`stute_test`.""" + + def test_does_not_reject_linear_dgp(self): + """Commit criterion 5 (single-realization): linear -> no reject.""" + d, dy = _linear_dgp(G=200, beta=2.0, sigma=0.3, seed=42) + r = stute_test(d, dy, alpha=0.05, n_bootstrap=199, seed=42) + assert r.reject is False + assert r.p_value > 0.05 + + def test_rejects_quadratic_dgp(self): + """Commit criterion 6: quadratic DGP -> reject.""" + d, dy = _quadratic_dgp(G=200, beta=2.0, gamma=5.0, sigma=0.3, seed=42) + r = stute_test(d, dy, alpha=0.05, n_bootstrap=199, seed=42) + assert r.reject is True + assert r.p_value < 0.05 + + def test_cvm_statistic_manual_equivalence(self): + """Verify the simplified CvM form matches the paper's stated form + in the no-ties case. + + Paper: S = Σ(g/G)^2 · ((1/g) Σ eps_(h))^2 + Code (no ties): S = (1/G^2) Σ (cumsum_g)^2 + These are algebraically identical when all dose values are unique. + """ + from diff_diff.had_pretests import _cvm_statistic + + eps = np.array([1.0, -0.5, 0.3, -0.2, 0.1]) + d_sorted = np.array([0.1, 0.2, 0.3, 0.4, 0.5]) # all unique -> no ties + G = len(eps) + + # Paper form + cumsum = np.cumsum(eps) + paper_S = 0.0 + for g in range(1, G + 1): + c_g = cumsum[g - 1] + paper_S += (g / G) ** 2 * (c_g / g) ** 2 + + code_S = _cvm_statistic(eps, d_sorted) + np.testing.assert_allclose(code_S, paper_S, atol=1e-14) + + def test_cvm_statistic_tie_safe_order_invariance(self): + """CRITICAL P1 fix: tie-safe CvM is order-invariant within tie blocks. + + Under the paper definition, at a tied dose ``D_g == D_{g+1}``, the + cusum ``c_G`` evaluated at both tied observations includes ALL + tie-block members. So the CvM statistic must not depend on the + within-tie ordering of residuals. 
A naive per-observation cumsum + (without tie-block collapse) would give different S values when + the within-tie residual order is permuted. + """ + from diff_diff.had_pretests import _cvm_statistic + + # 6 observations with two tie blocks: d = [0.1, 0.1, 0.1, 0.5, 0.5, 0.9] + d_sorted = np.array([0.1, 0.1, 0.1, 0.5, 0.5, 0.9]) + eps_a = np.array([1.0, -0.5, 0.3, 0.7, -0.2, 0.1]) + # Permute within-tie residual order (positions 0-2 and 3-4 permuted): + eps_b = np.array([0.3, 1.0, -0.5, -0.2, 0.7, 0.1]) + S_a = _cvm_statistic(eps_a, d_sorted) + S_b = _cvm_statistic(eps_b, d_sorted) + np.testing.assert_allclose(S_a, S_b, atol=1e-14) + + def test_stute_order_invariance_with_duplicate_doses(self): + """End-to-end P1 fix: stute_test on duplicate-dose inputs is invariant + to within-tie row ordering (tie-safe CvM contract propagates).""" + G = 40 + # Build d with ties and a matched dy + rng = np.random.default_rng(42) + d_unique = np.array([0.1, 0.3, 0.5, 0.8]) + d = np.repeat(d_unique, G // len(d_unique)) + dy = 2.0 * d + rng.normal(0.0, 0.1, size=G) + # Permute order within each tie block + perm = np.arange(G) + rng_perm = np.random.default_rng(123) + for start in range(0, G, G // len(d_unique)): + block = perm[start : start + G // len(d_unique)] + rng_perm.shuffle(block) + perm[start : start + G // len(d_unique)] = block + r_a = stute_test(d, dy, n_bootstrap=199, seed=42) + r_b = stute_test(d[perm], dy[perm], n_bootstrap=199, seed=42) + # Observed cvm_stat must be bit-identical (tie-safe). + np.testing.assert_allclose(r_a.cvm_stat, r_b.cvm_stat, atol=1e-14) + + def test_n_bootstrap_below_99_raises(self): + """Commit criterion 8: n_bootstrap < 99 -> ValueError.""" + d, dy = _linear_dgp(G=50) + with pytest.raises(ValueError, match=r"n_bootstrap must be >= 99"): + stute_test(d, dy, n_bootstrap=50) + + def test_exact_linear_returns_p1_not_nan(self): + """P1 fix: exact linear fit (eps=0) must fail-to-reject with p=1, + NOT return NaN. 
Assumption 8 holds exactly in this case.""" + G = 50 + d = np.linspace(0.1, 1.0, G) + dy = 1.0 + 2.0 * d # exact linear, residuals are zero + r = stute_test(d, dy, n_bootstrap=199, seed=42) + assert np.isfinite(r.p_value) + assert r.p_value == 1.0 # all bootstrap S_b tied at 0 with observed S + assert r.reject is False + assert r.cvm_stat == 0.0 + + def test_exact_linear_shortcut_does_not_fire_on_shifted_noisy_data(self): + """P0 fix (R2): the exact-linear short-circuit must NOT fire on + noisy data after an additive shift. The fix scales against + ``sum((dy - dybar)^2)`` (centered TSS, translation-invariant). + + Exact numerical equivalence of the two bootstrap runs is NOT + expected (FP cancellation at 1e12 scale costs ~4 digits of + precision), but the short-circuit behavior and decision MUST + match. + """ + rng = np.random.default_rng(42) + G = 50 + d = rng.uniform(0.0, 1.0, size=G) + dy = 2.0 * d + rng.normal(0.0, 1.0, size=G) + dy_shifted = dy + 1e12 + + r_raw = stute_test(d, dy, n_bootstrap=199, seed=42) + r_shift = stute_test(d, dy_shifted, n_bootstrap=199, seed=42) + + assert r_raw.cvm_stat > 0.0, "shortcut fired on raw noisy data" + assert r_shift.cvm_stat > 0.0, "shortcut fired on shifted noisy data" + assert r_raw.reject == r_shift.reject + np.testing.assert_allclose(r_raw.p_value, r_shift.p_value, rtol=5e-3) + + def test_exact_linear_shortcut_does_not_fire_on_rescaled_noisy_data(self): + """P0 fix (R5): the exact-linear short-circuit must NOT fire on + noisy data after a MULTIPLICATIVE rescaling. An earlier revision + used ``max(centered_TSS, 1.0)`` as the denominator in the guard, + which broke scale invariance: scaling ``dy`` by ``1e-12`` would + make ``centered_TSS ~ 1e-24`` but the floor would hold the + threshold at 1.0, firing the shortcut on noisy data that should + NOT trigger it. Fix uses a purely relative comparison with a + separate branch for the ``centered_TSS == 0`` edge case. 
+ """ + rng = np.random.default_rng(42) + G = 50 + d = rng.uniform(0.0, 1.0, size=G) + dy = 2.0 * d + rng.normal(0.0, 1.0, size=G) + dy_scaled = dy * 1e-12 # multiplicative rescale + + r_raw = stute_test(d, dy, n_bootstrap=199, seed=42) + r_scaled = stute_test(d, dy_scaled, n_bootstrap=199, seed=42) + + # Neither run may hit the short-circuit. + assert r_raw.cvm_stat > 0.0, "shortcut fired on raw noisy data" + assert r_scaled.cvm_stat > 0.0, "shortcut fired on scaled noisy data" + # Decision must be preserved: rescaling dy cannot change the + # rejection outcome (the statistic is scale-invariant in dy). + assert r_raw.reject == r_scaled.reject + + def test_mismatched_lengths_raise(self): + with pytest.raises(ValueError, match="same length"): + stute_test(np.array([0.1, 0.2, 0.3]), np.array([1.0, 2.0])) + + def test_nan_input_raises(self): + d, dy = _linear_dgp(G=50) + dy_nan = dy.copy() + dy_nan[5] = np.nan + with pytest.raises(ValueError, match="contains NaN"): + stute_test(d, dy_nan) + + +# ============================================================================= +# Yatchew-HR test +# ============================================================================= + + +class TestYatchewHRTest: + """Tests for :func:`yatchew_hr_test`.""" + + def test_does_not_reject_linear_dgp(self): + """Commit criterion 9 (single-realization): linear -> no reject.""" + d, dy = _linear_dgp(G=200, beta=2.0, sigma=0.3, seed=42) + r = yatchew_hr_test(d, dy, alpha=0.05) + assert r.reject is False + + def test_rejects_quadratic_dgp(self): + """Commit criterion 9 (single-realization): quadratic -> reject.""" + d, dy = _quadratic_dgp(G=200, beta=2.0, gamma=5.0, sigma=0.3, seed=42) + r = yatchew_hr_test(d, dy, alpha=0.05) + assert r.reject is True + assert r.t_stat_hr > r.critical_value + + def test_sigma2_diff_normalizer_is_2G_paper_literal(self): + """Commit criterion 10: sigma2_diff divides by 2G (paper-literal). 
+ + Hand-compute sigma2_diff on a known input and verify the + implementation divides by 2G, NOT 2(G-1). + """ + # G = 4 deterministic inputs; sort by d first. + d = np.array([0.1, 0.2, 0.3, 0.4]) + dy = np.array([1.0, 3.0, 2.0, 5.0]) + r = yatchew_hr_test(d, dy, alpha=0.05) + + # Sorted by d: (already sorted) + # diff_dy = [dy[1]-dy[0], dy[2]-dy[1], dy[3]-dy[2]] + # = [2, -1, 3]; squared = [4, 1, 9]; sum = 14 + # Paper-literal: sigma2_diff = 14 / (2 * 4) = 1.75 + # WRONG form (using G-1=3): 14 / (2 * 3) = 2.333... + sum_sq_diff = 14.0 + G = 4 + expected_paper_literal = sum_sq_diff / (2.0 * G) + expected_wrong_form = sum_sq_diff / (2.0 * (G - 1)) + np.testing.assert_allclose(r.sigma2_diff, expected_paper_literal, atol=1e-12) + assert ( + abs(r.sigma2_diff - expected_wrong_form) > 1e-6 + ), "sigma2_diff should divide by 2G (paper-literal), NOT 2(G-1)." + + def test_invalid_alpha_raises(self): + d, dy = _linear_dgp(G=50) + with pytest.raises(ValueError, match="alpha must satisfy"): + yatchew_hr_test(d, dy, alpha=-0.1) + + def test_exact_linear_returns_p1_not_nan(self): + """P1 fix: exact linear fit (eps=0) must fail-to-reject with p=1, + NOT NaN. Yatchew statistic is formally -inf (finite-negative numerator + over zero denominator), which corresponds to p=1 under the one-sided + standard-normal critical value.""" + G = 50 + d = np.linspace(0.1, 1.0, G) + dy = 1.0 + 2.0 * d # exact linear, residuals are zero + r = yatchew_hr_test(d, dy) + assert np.isfinite(r.p_value) + assert r.p_value == 1.0 + assert r.reject is False + # t_stat_hr is formally -inf for the exact-linear limit. 
+ assert r.t_stat_hr == float("-inf") + + def test_exact_linear_shortcut_does_not_fire_on_shifted_noisy_data(self): + """P0 fix (R2): Yatchew exact-linear short-circuit is translation- + invariant (comparison against centered TSS, not raw sum(dy^2)).""" + rng = np.random.default_rng(42) + G = 50 + d = rng.uniform(0.0, 1.0, size=G) + dy = 2.0 * d + rng.normal(0.0, 1.0, size=G) + dy_shifted = dy + 1e12 + + r_raw = yatchew_hr_test(d, dy) + r_shift = yatchew_hr_test(d, dy_shifted) + + assert np.isfinite(r_raw.t_stat_hr), "shortcut fired on raw noisy data" + assert np.isfinite(r_shift.t_stat_hr), "shortcut fired on shifted noisy data" + assert r_raw.reject == r_shift.reject + np.testing.assert_allclose(r_raw.t_stat_hr, r_shift.t_stat_hr, rtol=1e-3) + + def test_exact_linear_shortcut_does_not_fire_on_rescaled_noisy_data(self): + """P0 fix (R5): Yatchew exact-linear short-circuit is scale- + invariant. An earlier revision used a ``max(centered_TSS, 1.0)`` + floor that broke scale invariance under multiplicative rescaling + of dy (e.g. ``dy * 1e-12``). Fix removes the floor and handles + the zero-centered-TSS case in a separate branch.""" + rng = np.random.default_rng(42) + G = 50 + d = rng.uniform(0.0, 1.0, size=G) + dy = 2.0 * d + rng.normal(0.0, 1.0, size=G) + dy_scaled = dy * 1e-12 + + r_raw = yatchew_hr_test(d, dy) + r_scaled = yatchew_hr_test(d, dy_scaled) + + # Neither run may short-circuit to t_stat_hr = -inf. + assert np.isfinite(r_raw.t_stat_hr), "shortcut fired on raw noisy data" + assert np.isfinite(r_scaled.t_stat_hr), "shortcut fired on scaled noisy data" + # Decision preserved: Yatchew is scale-invariant in dy. + assert r_raw.reject == r_scaled.reject + # T_hr is scale-invariant: sigma2_lin and sigma2_diff both scale + # by c^2, sigma2_W scales by c^2, so the ratio is unchanged. 
+        np.testing.assert_allclose(r_raw.t_stat_hr, r_scaled.t_stat_hr, rtol=1e-3)
+
+    def test_duplicate_doses_return_nan(self):
+        """P1 fix: yatchew_hr_test rejects duplicate doses with UserWarning
+        + NaN result. Adjacent differences depend on within-tie row order,
+        which is non-methodological; users with tied doses should use
+        stute_test (tie-safe CvM)."""
+        d = np.array([0.1, 0.2, 0.2, 0.5, 0.8])  # one duplicate at 0.2
+        dy = np.array([1.0, 2.0, 2.5, 3.0, 4.0])
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = yatchew_hr_test(d, dy)
+        msgs = [str(w.message) for w in caught]
+        assert any("duplicate" in m for m in msgs)
+        assert np.isnan(r.t_stat_hr)
+        assert np.isnan(r.p_value)
+        assert r.reject is False
+
+    def test_constant_d_returns_nan(self):
+        """P1 fix: yatchew_hr_test rejects constant d (degenerate case of
+        duplicate doses: all observations tied)."""
+        d = np.full(10, 0.5)
+        dy = np.linspace(0.0, 2.0, 10)
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = yatchew_hr_test(d, dy)
+        msgs = [str(w.message) for w in caught]
+        assert any("duplicate" in m for m in msgs)
+        assert np.isnan(r.t_stat_hr)
+        assert r.reject is False
+
+    def test_sigma4_W_zero_with_positive_numerator_rejects(self):
+        """P0 fix: when sigma4_W = 0 AFTER the exact-linear shortcut
+        (i.e., residuals are NOT all zero but adjacent-residual-product
+        sums vanish), the Yatchew statistic is formally +inf or -inf
+        depending on numerator sign. A unit-dose, non-exact-linear
+        input where residuals alternate zero/non-zero must NOT be
+        silently mapped to p=1 (fail-to-reject).
+
+        Counterexample from CI R3 reviewer: d=[1,2,3,4,5],
+        dy=[1,0,-2,0,1]. OLS slope = 0, residuals = dy itself,
+        sigma2_lin = 1.2, sigma2_diff = 1.0, sigma4_W = 0 (each
+        adjacent residual product includes a zero factor), and the
+        formal statistic is sqrt(5) * 0.2 / 0 = +inf -> reject=True.
+        Previously the sigma4_W <= 0 branch unconditionally returned
+        p=1, which flipped a legitimate rejection.
+        """
+        d = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+        dy = np.array([1.0, 0.0, -2.0, 0.0, 1.0])
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = yatchew_hr_test(d, dy, alpha=0.05)
+        msgs = [str(w.message) for w in caught]
+        # Must NOT silently fail-to-reject.
+        assert not (r.p_value == 1.0 and r.reject is False), (
+            "sigma4_W=0 with positive numerator was silently mapped to "
+            "p=1, reject=False; expected +inf reject path."
+        )
+        # Positive numerator -> +inf statistic, p=0, reject=True.
+        assert r.t_stat_hr == float("inf")
+        assert r.p_value == 0.0
+        assert r.reject is True
+        # The caller should have been warned.
+        assert any("sigma4_W = 0" in m for m in msgs)
+        # Sanity: sigma2_lin and sigma2_diff preserved for inspection.
+        np.testing.assert_allclose(r.sigma2_lin, 1.2, atol=1e-12)
+        np.testing.assert_allclose(r.sigma2_diff, 1.0, atol=1e-12)
+
+
+# =============================================================================
+# Composite workflow
+# =============================================================================
+
+
+class TestCompositeWorkflow:
+    """Tests for :func:`did_had_pretest_workflow`."""
+
+    def test_all_pass_on_linear_flags_assumption7_gap(self):
+        """Commit criterion 11: linear DGP -> all_pass, verdict flags
+        Assumption 7 pre-trends gap per Phase 3 partial-workflow scope."""
+        d, dy = _linear_dgp(G=200, beta=2.0, sigma=0.3, seed=42)
+        panel = _make_two_period_panel(200, d, dy, seed=42)
+        report = did_had_pretest_workflow(
+            panel,
+            outcome_col="y",
+            dose_col="d",
+            time_col="time",
+            unit_col="unit",
+            alpha=0.05,
+            n_bootstrap=199,
+            seed=42,
+        )
+        assert report.all_pass is True
+        # Partial-workflow verdict: explicitly names the Assumption 7 gap
+        # so callers do not receive a misleading "TWFE safe" signal.
+        assert "QUG and linearity diagnostics fail-to-reject" in report.verdict
+        assert "Assumption 7" in report.verdict
+        assert "pre-trends" in report.verdict
+        assert "paper step 2 deferred" in report.verdict
+        # Verdict must NOT claim unconditional TWFE safety.
+        assert "TWFE safe" not in report.verdict
+        assert report.n_obs == 200
+        assert isinstance(report.qug, QUGTestResults)
+        assert isinstance(report.stute, StuteTestResults)
+        assert isinstance(report.yatchew, YatchewTestResults)
+
+    def test_rejects_on_quadratic_plus_shifted_support(self):
+        """Commit criterion 12: quadratic + shifted support -> both reject."""
+        rng = np.random.default_rng(42)
+        G = 200
+        # Shifted-Beta support [0.2, 1.2]: QUG should reject.
+        d = 0.2 + rng.beta(2, 2, size=G)
+        # Quadratic dose response: linearity tests should reject.
+        dy = 2.0 * d + 5.0 * d * d + rng.normal(0, 0.3, size=G)
+        panel = _make_two_period_panel(G, d, dy, seed=42)
+        report = did_had_pretest_workflow(
+            panel,
+            outcome_col="y",
+            dose_col="d",
+            time_col="time",
+            unit_col="unit",
+            alpha=0.05,
+            n_bootstrap=199,
+            seed=42,
+        )
+        assert report.all_pass is False
+        assert "support infimum rejected" in report.verdict
+        assert "linearity rejected" in report.verdict
+        # At least QUG and one of {Stute, Yatchew} rejected.
+        assert report.qug.reject is True
+        assert report.stute.reject or report.yatchew.reject
+
+    def test_workflow_handles_tied_zero_doses_via_stute_fallback(self):
+        """R4 P2 fix: on a common QUG-style panel with repeated d=0
+        (never-treated comparison group), Yatchew returns NaN (duplicate
+        doses) but Stute handles ties correctly. The composite workflow
+        must adjudicate the linearity step via Stute ALONE per the paper's
+        "Stute or Yatchew" step-3 wording, producing a conclusive verdict
+        with a Yatchew-skipped note rather than forcing the whole report
+        to inconclusive.
+
+        Paper review line 298 cites an application with 12 zero-dose
+        units, so this panel shape is not exotic.
+        """
+        rng = np.random.default_rng(42)
+        G = 60
+        # 20 never-treated (d=0 at post), 40 treated with random positive.
+        d = np.zeros(G)
+        d[20:] = rng.uniform(0.1, 1.0, size=40)
+        dy = 2.0 * d + rng.normal(0.0, 0.3, size=G)
+        panel = _make_two_period_panel(G, d, dy, seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            report = did_had_pretest_workflow(
+                panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+            )
+        # Yatchew must NaN (ties from the 20 zero doses).
+        assert np.isnan(report.yatchew.p_value)
+        assert report.yatchew.reject is False
+        # QUG and Stute must be conclusive.
+        assert np.isfinite(report.qug.p_value)
+        assert np.isfinite(report.stute.p_value)
+        # Composite must be conclusive (NOT inconclusive).
+        assert "inconclusive" not in report.verdict
+        # Verdict should note Yatchew was skipped.
+        assert "Yatchew NaN - skipped" in report.verdict
+        # With a linear DGP, all_pass should be True despite Yatchew NaN
+        # (step 3 handled by Stute alone, per paper's "Stute OR Yatchew").
+        assert report.all_pass is True
+
+    def test_all_pass_false_when_any_test_nan(self):
+        """P1 fix: all_pass must NOT be True when any constituent test is
+        NaN. Previously all_pass was computed from `not reject` only, so a
+        NaN-statistic test (reject=False by convention) incorrectly tripped
+        all_pass=True while verdict was "inconclusive"."""
+        G = 40
+        # Constant dose triggers QUG tie (D_(1) == D_(2)) -> NaN.
+        d = np.full(G, 0.5)
+        dy = np.linspace(0.0, 2.0, G)
+        panel = _make_two_period_panel(G, d, dy, seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            report = did_had_pretest_workflow(
+                panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+            )
+        # At least one test produces NaN, so `all_pass` MUST be False even
+        # though none of the tests set reject=True.
+        assert np.isnan(report.qug.p_value)
+        assert report.all_pass is False
+        assert report.verdict.startswith("inconclusive")
+
+    def test_workflow_seed_controls_stute_only(self):
+        """Workflow seed forwards to Stute only; re-run with same seed is reproducible."""
+        d, dy = _linear_dgp(G=100, beta=2.0, sigma=0.3, seed=42)
+        panel = _make_two_period_panel(100, d, dy, seed=42)
+        report_a = did_had_pretest_workflow(
+            panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+        )
+        report_b = did_had_pretest_workflow(
+            panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+        )
+        assert report_a.stute.p_value == report_b.stute.p_value
+        assert report_a.qug.t_stat == report_b.qug.t_stat
+        assert report_a.yatchew.t_stat_hr == report_b.yatchew.t_stat_hr
+
+
+# =============================================================================
+# NaN propagation
+# =============================================================================
+
+
+class TestNaNPropagation:
+    """Degenerate-input behavior: NaN propagation with an inconclusive
+    verdict, plus the trivially-linear constant-dy edge case."""
+
+    def test_constant_dy_trivially_satisfies_linearity(self):
+        """Commit criterion 13 (reframed): constant dy is trivially linear
+        in d; Stute and Yatchew should fail-to-reject with large p-values.
+
+        Constant dy means OLS residuals are mathematically zero and the CvM
+        cusum is zero; the bootstrap yields all-tied ``S_b = 0``. The test
+        correctly fails-to-reject (linearity holds). NaN propagation is
+        covered by :meth:`test_qug_tie_propagates_to_composite` below.
+        """
+        G = 50
+        d = np.linspace(0.1, 1.0, G)
+        dy = np.full(G, 3.0)
+        panel = _make_two_period_panel(G, d, dy, seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            report = did_had_pretest_workflow(
+                panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+            )
+        assert report.stute.reject is False
+        assert report.yatchew.reject is False
+        assert report.stute.p_value > 0.5
+        assert report.yatchew.p_value > 0.5
+
+    def test_qug_tie_propagates_to_composite(self):
+        """QUG tie in composite -> verdict mentions QUG NaN."""
+        G = 20
+        # All units share the same positive dose -> D_(1) == D_(2) tie.
+        d = np.full(G, 0.5)
+        dy = np.linspace(0.0, 2.0, G)
+        panel = _make_two_period_panel(G, d, dy, seed=42)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            report = did_had_pretest_workflow(
+                panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42
+            )
+        assert np.isnan(report.qug.p_value)
+        assert "QUG" in report.verdict
+        assert report.verdict.startswith("inconclusive")
+
+    def test_stute_constant_d_returns_nan(self):
+        G = 50
+        d = np.full(G, 0.5)
+        dy = np.linspace(0.0, 2.0, G)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            r = stute_test(d, dy, n_bootstrap=199, seed=42)
+        assert np.isnan(r.cvm_stat)
+        assert r.reject is False
+
+
+# =============================================================================
+# JSON / DataFrame serialization
+# =============================================================================
+
+
+class TestJSONSerialization:
+    """to_dict() is JSON-safe; to_dataframe() returns correct schema."""
+
+    def test_qug_to_dict_is_json_safe(self):
+        d = np.array([0.2, 0.5, 0.9])
+        r = qug_test(d)
+        json_str = json.dumps(r.to_dict())
+        round_tripped = json.loads(json_str)
+        assert round_tripped["test"] == "qug"
+        assert round_tripped["reject"] is False
+
+    def test_stute_to_dict_is_json_safe(self):
+        d, dy = _linear_dgp(G=50)
+        r = stute_test(d, dy, n_bootstrap=199, seed=42)
+        json.dumps(r.to_dict())  # must not raise
+
+    def test_yatchew_to_dict_is_json_safe(self):
+        d, dy = _linear_dgp(G=50)
+        r = yatchew_hr_test(d, dy)
+        json.dumps(r.to_dict())  # must not raise
+
+    def test_report_to_dict_is_json_safe(self):
+        """Commit criterion 16: json.dumps(report.to_dict()) succeeds."""
+        d, dy = _linear_dgp(G=50)
+        panel = _make_two_period_panel(50, d, dy, seed=42)
+        report = did_had_pretest_workflow(panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42)
+        json_str = json.dumps(report.to_dict())
+        round_tripped = json.loads(json_str)
+        assert "qug" in round_tripped
+        assert "stute" in round_tripped
+        assert "yatchew" in round_tripped
+        assert "verdict" in round_tripped
+
+    def test_report_to_dataframe_schema(self):
+        """Commit criterion 17: 3-row frame with specified columns in order."""
+        d, dy = _linear_dgp(G=50)
+        panel = _make_two_period_panel(50, d, dy, seed=42)
+        report = did_had_pretest_workflow(panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42)
+        df = report.to_dataframe()
+        expected_cols = [
+            "test",
+            "statistic_name",
+            "statistic_value",
+            "p_value",
+            "reject",
+            "alpha",
+            "n_obs",
+        ]
+        assert list(df.columns) == expected_cols
+        assert list(df["test"]) == ["qug", "stute", "yatchew_hr"]
+        assert list(df["statistic_name"]) == ["t_stat", "cvm_stat", "t_stat_hr"]
+        assert len(df) == 3
+
+    def test_per_test_to_dataframe_is_single_row(self):
+        """Commit criterion 18: per-test to_dataframe returns a 1-row frame."""
+        d, dy = _linear_dgp(G=50)
+        r_stute = stute_test(d, dy, n_bootstrap=199, seed=42)
+        r_yatchew = yatchew_hr_test(d, dy)
+        r_qug = qug_test(d)
+        assert len(r_stute.to_dataframe()) == 1
+        assert len(r_yatchew.to_dataframe()) == 1
+        assert len(r_qug.to_dataframe()) == 1
+        assert r_stute.to_dataframe()["test"].iloc[0] == "stute"
+
+
+# =============================================================================
+# Seed reproducibility
+# =============================================================================
+
+
+class TestSeedReproducibility:
+    """Commit criterion 7: stute_test(seed=42) bitwise-reproducible."""
+
+    def test_stute_seed_reproducible(self):
+        d, dy = _linear_dgp(G=100, seed=1)
+        r_a = stute_test(d, dy, n_bootstrap=199, seed=42)
+        r_b = stute_test(d, dy, n_bootstrap=199, seed=42)
+        assert r_a.cvm_stat == r_b.cvm_stat
+        assert r_a.p_value == r_b.p_value  # bitwise
+
+    def test_stute_different_seeds_produce_different_pvals(self):
+        d, dy = _linear_dgp(G=100, seed=1)
+        r_a = stute_test(d, dy, n_bootstrap=199, seed=42)
+        r_b = stute_test(d, dy, n_bootstrap=199, seed=43)
+        # CvM stat is seed-independent; only p-value varies across bootstraps.
+        assert r_a.cvm_stat == r_b.cvm_stat
+        # P-values should differ at B = 199 (grid = 1/200).
+        assert r_a.p_value != r_b.p_value
+
+
+# =============================================================================
+# Sample-size gates
+# =============================================================================
+
+
+class TestSampleSizeGates:
+    """Below-minimum sample sizes emit UserWarning + NaN result (no raise)."""
+
+    def test_qug_below_min_returns_nan(self):
+        """G=1 (after filter) -> NaN, warning, reject=False."""
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = qug_test(np.array([0.5]))
+        msgs = [str(w.message) for w in caught]
+        assert any("at least 2" in m for m in msgs)
+        assert np.isnan(r.t_stat)
+        assert r.reject is False
+
+    def test_qug_all_zero_returns_nan(self):
+        """All d=0 -> filter removes everything -> NaN."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            r = qug_test(np.zeros(10))
+        assert np.isnan(r.t_stat)
+        assert r.n_obs == 0
+        assert r.n_excluded_zero == 10
+
+    def test_stute_below_min_returns_nan(self):
+        """Commit criterion 14: stute_test(G=8) -> NaN, warning, no raise."""
+        d, dy = _linear_dgp(G=8)
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = stute_test(d, dy, n_bootstrap=199, seed=42)
+        msgs = [str(w.message) for w in caught]
+        assert any("below the minimum" in m for m in msgs)
+        assert np.isnan(r.cvm_stat)
+        assert r.reject is False
+
+    def test_yatchew_below_min_returns_nan(self):
+        """yatchew_hr_test(G=2) -> NaN, warning."""
+        d = np.array([0.1, 0.5])
+        dy = np.array([1.0, 2.0])
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            r = yatchew_hr_test(d, dy)
+        msgs = [str(w.message) for w in caught]
+        assert any("below the minimum" in m for m in msgs)
+        assert np.isnan(r.t_stat_hr)
+        assert r.reject is False
+
+
+# =============================================================================
+# _compose_verdict priority logic
+# =============================================================================
+
+
+def _mk_qug(reject: bool, p: float = 0.5) -> QUGTestResults:
+    return QUGTestResults(
+        t_stat=10.0 if reject else 0.5,
+        p_value=p,
+        reject=reject,
+        alpha=0.05,
+        critical_value=19.0,
+        n_obs=50,
+        n_excluded_zero=0,
+        d_order_1=0.1,
+        d_order_2=0.2,
+    )
+
+
+def _mk_stute(reject: bool, p: float = 0.5) -> StuteTestResults:
+    return StuteTestResults(
+        cvm_stat=0.5,
+        p_value=p,
+        reject=reject,
+        alpha=0.05,
+        n_bootstrap=999,
+        n_obs=50,
+        seed=42,
+    )
+
+
+def _mk_yatchew(reject: bool, p: float = 0.5) -> YatchewTestResults:
+    return YatchewTestResults(
+        t_stat_hr=3.0 if reject else -0.5,
+        p_value=p,
+        reject=reject,
+        alpha=0.05,
+        critical_value=1.645,
+        sigma2_lin=1.0,
+        sigma2_diff=1.0,
+        sigma2_W=1.0,
+        n_obs=50,
+    )
+
+
+class TestComposeVerdictLogic:
+    """Commit criterion 15: 5 priority-order tests."""
+
+    def test_qug_nan_is_inconclusive(self):
+        """(a1) QUG NaN forces inconclusive (step 1 is required)."""
+        q = _mk_qug(reject=False, p=float("nan"))
+        s = _mk_stute(reject=False)
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert verdict == "inconclusive - QUG NaN"
+
+    def test_both_linearity_nan_is_inconclusive(self):
+        """(a2) When BOTH Stute and Yatchew are NaN, the linearity step
+        cannot be adjudicated and the verdict is inconclusive."""
+        q = _mk_qug(reject=False)
+        s = _mk_stute(reject=False, p=float("nan"))
+        y = _mk_yatchew(reject=False, p=float("nan"))
+        verdict = _compose_verdict(q, s, y)
+        assert verdict == "inconclusive - both Stute and Yatchew linearity tests NaN"
+
+    def test_yatchew_nan_alone_does_not_force_inconclusive(self):
+        """CRITICAL R4 P3 fix: the paper says step 3 uses "Stute OR
+        Yatchew". A conclusive Stute must be sufficient even when Yatchew
+        returns NaN (e.g. tied-dose panels). Verdict should be
+        fail-to-reject with a "(Yatchew NaN - skipped)" suffix."""
+        q = _mk_qug(reject=False)
+        s = _mk_stute(reject=False)
+        y = _mk_yatchew(reject=False, p=float("nan"))
+        verdict = _compose_verdict(q, s, y)
+        assert "inconclusive" not in verdict
+        assert verdict.startswith("QUG and linearity diagnostics fail-to-reject")
+        assert "Yatchew NaN - skipped" in verdict
+        assert "Assumption 7" in verdict
+
+    def test_stute_nan_alone_does_not_force_inconclusive(self):
+        """Mirror of the above: Yatchew conclusive + Stute NaN should
+        still adjudicate the linearity step via Yatchew alone."""
+        q = _mk_qug(reject=False)
+        s = _mk_stute(reject=False, p=float("nan"))
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert "inconclusive" not in verdict
+        assert verdict.startswith("QUG and linearity diagnostics fail-to-reject")
+        assert "Stute NaN - skipped" in verdict
+
+    def test_qug_reject_with_both_linearity_nan_surfaces_rejection(self):
+        """R6 P1 fix: a conclusive QUG rejection must NOT be hidden by
+        "inconclusive" just because BOTH linearity tests are NaN. Paper
+        rule is one-way: TWFE is admissible only if NO test rejects, so
+        any conclusive rejection dominates unresolved-step notes.
+        """
+        q = _mk_qug(reject=True)
+        s = _mk_stute(reject=False, p=float("nan"))
+        y = _mk_yatchew(reject=False, p=float("nan"))
+        verdict = _compose_verdict(q, s, y)
+        # Rejection must be surfaced as the primary verdict.
+        assert "support infimum rejected" in verdict
+        assert not verdict.startswith("inconclusive")
+        # Unresolved linearity step is appended, not replacing the rejection.
+        assert "additional steps unresolved" in verdict
+        assert "both Stute and Yatchew" in verdict
+
+    def test_linearity_reject_with_qug_nan_surfaces_rejection(self):
+        """R6 P1 fix: a conclusive linearity rejection (Stute or Yatchew)
+        must NOT be hidden by "inconclusive - QUG NaN"."""
+        q = _mk_qug(reject=False, p=float("nan"))
+        s = _mk_stute(reject=True)
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert "linearity rejected" in verdict
+        assert "Stute" in verdict
+        assert not verdict.startswith("inconclusive")
+        assert "additional steps unresolved" in verdict
+        assert "QUG NaN" in verdict
+
+    def test_all_three_reject_with_qug_nan_keeps_conclusive_rejections(self):
+        """Mixed case: Stute and Yatchew both conclusively reject, QUG
+        unresolved. All conclusive rejections must be reported."""
+        q = _mk_qug(reject=False, p=float("nan"))
+        s = _mk_stute(reject=True)
+        y = _mk_yatchew(reject=True)
+        verdict = _compose_verdict(q, s, y)
+        assert "linearity rejected" in verdict
+        assert "Stute" in verdict
+        assert "Yatchew" in verdict
+        assert "QUG NaN" in verdict
+        assert not verdict.startswith("inconclusive")
+
+    def test_none_reject_flags_assumption7_gap(self):
+        """(b) None reject -> verdict flags the Assumption 7 pre-trends gap
+        rather than claiming unconditional TWFE safety."""
+        q = _mk_qug(reject=False)
+        s = _mk_stute(reject=False)
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert "QUG and linearity diagnostics fail-to-reject" in verdict
+        assert "Assumption 7" in verdict
+        assert "pre-trends" in verdict
+        assert "paper step 2 deferred" in verdict
+        assert "TWFE safe" not in verdict
+
+    def test_only_qug_rejects(self):
+        """(c) Only QUG rejects -> QUG-only message."""
+        q = _mk_qug(reject=True)
+        s = _mk_stute(reject=False)
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert "support infimum rejected" in verdict
+        assert "linearity rejected" not in verdict
+
+    def test_only_linearity_rejects(self):
+        """(d) Only linearity rejects -> linearity-only message."""
+        q = _mk_qug(reject=False)
+        s = _mk_stute(reject=True)
+        y = _mk_yatchew(reject=False)
+        verdict = _compose_verdict(q, s, y)
+        assert "linearity rejected" in verdict
+        assert "Stute" in verdict
+        assert "Yatchew" not in verdict
+        assert "support infimum rejected" not in verdict
+
+    def test_all_reject_bundles_reasons(self):
+        """(e) All reject -> bundled message naming each."""
+        q = _mk_qug(reject=True)
+        s = _mk_stute(reject=True)
+        y = _mk_yatchew(reject=True)
+        verdict = _compose_verdict(q, s, y)
+        assert "support infimum rejected" in verdict
+        assert "linearity rejected" in verdict
+        assert "Stute" in verdict
+        assert "Yatchew" in verdict
+        assert "; " in verdict  # bundled with separator
+
+
+# =============================================================================
+# Top-level import surface
+# =============================================================================
+
+
+def test_phase_3_imports_at_top_level():
+    """Commit criterion 19: all Phase 3 names importable from `diff_diff`."""
+    import diff_diff
+
+    expected = [
+        "HADPretestReport",
+        "QUGTestResults",
+        "StuteTestResults",
+        "YatchewTestResults",
+        "did_had_pretest_workflow",
+        "qug_test",
+        "stute_test",
+        "yatchew_hr_test",
+    ]
+    for name in expected:
+        assert hasattr(diff_diff, name), f"diff_diff.{name} missing from public API"
+        assert name in diff_diff.__all__, f"{name} missing from diff_diff.__all__"
+
+
+# =============================================================================
+# summary() smoke
+# =============================================================================
+
+
+class TestSummary:
+    """summary() / print_summary() produce non-empty strings for all classes."""
+
+    def test_qug_summary_non_empty(self):
+        r = qug_test(np.array([0.2, 0.5, 0.9]))
+        s = r.summary()
+        assert len(s) > 0
+        assert "QUG" in s
+
+    def test_stute_summary_non_empty(self):
+        d, dy = _linear_dgp(G=50)
+        r = stute_test(d, dy, n_bootstrap=199, seed=42)
+        s = r.summary()
+        assert len(s) > 0
+        assert "Stute" in s or "CvM" in s
+
+    def test_yatchew_summary_non_empty(self):
+        d, dy = _linear_dgp(G=50)
+        r = yatchew_hr_test(d, dy)
+        s = r.summary()
+        assert len(s) > 0
+        assert "Yatchew" in s
+
+    def test_report_summary_bundles_all(self):
+        d, dy = _linear_dgp(G=50)
+        panel = _make_two_period_panel(50, d, dy, seed=42)
+        report = did_had_pretest_workflow(panel, "y", "d", "time", "unit", n_bootstrap=199, seed=42)
+        s = report.summary()
+        assert "QUG" in s
+        assert "Stute" in s or "CvM" in s
+        assert "Yatchew" in s
+        assert "Verdict:" in s
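As a reviewer aid, the hand-verified Yatchew constants above (the R3 counterexample with `sigma2_lin = 1.2`, `sigma2_diff = 1.0`, `sigma4_W = 0`) can be reproduced with a minimal NumPy sketch. The helper name `yatchew_pieces` and the mean-of-squared-adjacent-residual-products form of `sigma4_W` are illustrative assumptions, not the shipped `yatchew_hr_test` implementation:

```python
import numpy as np

def yatchew_pieces(d, dy):
    """Illustrative building blocks of the Yatchew HR statistic.

    Sorts by dose, fits OLS of dy on [1, d], and returns
    (sigma2_lin, sigma2_diff, sigma4_W). The sigma4_W form used here
    (mean of squared adjacent residual products) is an assumption for
    illustration only.
    """
    order = np.argsort(d)
    d, dy = d[order], dy[order]
    G = d.size
    X = np.column_stack([np.ones(G), d])
    beta, *_ = np.linalg.lstsq(X, dy, rcond=None)
    eps = dy - X @ beta                       # OLS residuals
    sigma2_lin = np.sum(eps ** 2) / G         # linear-fit residual variance
    # Paper-literal differencing estimator: divide by 2G, NOT 2(G-1).
    sigma2_diff = np.sum(np.diff(dy) ** 2) / (2.0 * G)
    # Vanishes whenever every adjacent residual product has a zero factor.
    sigma4_W = np.sum((eps[:-1] * eps[1:]) ** 2) / G
    return sigma2_lin, sigma2_diff, sigma4_W

# R3 counterexample: OLS slope is 0, so the residuals equal dy itself.
d = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
dy = np.array([1.0, 0.0, -2.0, 0.0, 1.0])
s_lin, s_diff, s4w = yatchew_pieces(d, dy)
print(round(s_lin, 6), round(s_diff, 6), round(s4w, 6))  # -> 1.2 1.0 0.0
```

With `sigma4_W = 0` and a positive numerator `sigma2_lin - sigma2_diff = 0.2`, the formal statistic `sqrt(5) * 0.2 / 0` is `+inf`, which is exactly the reject path `test_sigma4_W_zero_with_positive_numerator_rejects` asserts.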