diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca42006f..a84732c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 - Add Zenodo DOI badge to README; upgrade the BibTeX citation block with the concept DOI (`10.5281/zenodo.19646175`) and list author as Isaac Gerber (matching `CITATION.cff`). Add `doi:` and `identifiers:` entries (concept + versioned) to `CITATION.cff`. DOI was minted by Zenodo when v3.1.3 was released.
+- **`ChaisemartinDHaultfoeuille` heterogeneity + within-group-varying PSU/strata now supported under Binder TSL** - `fit(heterogeneity=..., survey_design=...)` no longer raises `NotImplementedError` when the resolved design's PSU or strata vary across the cells of a group. On the **Binder TSL** branch (`compute_survey_if_variance`), the heterogeneity WLS coefficient IF is expanded to observation level via the cell-period allocator `ψ_i = ψ_g * (w_i / W_{g, out_idx})` on the post-period cell — the DID_l post-period single-cell convention shipped in v3.1.x. Under PSU=group the PSU-level Binder TSL variance is byte-identical to the previous release (PSU-level aggregate telescopes to `ψ_g`); under within-group-varying PSU, mass lands in the post-period PSU of the transition. The **Rao-Wu replicate-weight** branch (`compute_replicate_if_variance`) retains the legacy group-level allocator `ψ_i = ψ_g * (w_i / W_g)`: replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at observation level and is therefore not PSU-telescoping, so the cell-period allocator would silently change the replicate SE whenever a replicate column's ratios vary within group (e.g., per-row replicate matrices). Replicate + heterogeneity fits therefore produce byte-identical SE to the previous release, and the newly-unblocked `heterogeneity=` + within-group-varying PSU combination is unreachable under replicate designs by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). `n_bootstrap > 0` combined with within-group-varying PSU remains gated with `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map and will be extended in a follow-up PR.
 
 ## [3.1.3] - 2026-04-18
 
diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py
index a8562a33..846c9506 100644
--- a/diff_diff/chaisemartin_dhaultfoeuille.py
+++ b/diff_diff/chaisemartin_dhaultfoeuille.py
@@ -665,10 +665,7 @@ def fit(
             clustered bootstrap. **Out-of-scope combinations raise
             ``NotImplementedError``**: (a) replicate weights with
             ``n_bootstrap > 0`` (replicate variance is closed-form;
-            bootstrap would double-count variance); (b)
-            ``heterogeneity=`` with PSU/strata that vary within group
-            (heterogeneity WLS still uses the legacy group-level IF
-            expansion; follow-up PR extends it); (c) ``n_bootstrap >
+            bootstrap would double-count variance); (b) ``n_bootstrap >
             0`` with PSU that varies within group (PSU-level bootstrap
             still uses the legacy group-level PSU map; follow-up PR
             extends it). See REGISTRY.md
@@ -828,39 +825,25 @@ def fit(
 
             # Cell-period IF allocator contract: strata and PSU must be
             # constant within each (g, t) cell, a strict relaxation of
-            # the previous within-group constancy rule. Two out-of-scope
-            # combinations are gated with NotImplementedError until the
-            # corresponding follow-up PRs extend them:
-            #   - heterogeneity= + within-group-varying PSU/strata
-            #     (PR 3: cell-period allocator for the WLS psi_obs)
+            # the previous within-group constancy rule. One out-of-scope
+            # combination remains gated with NotImplementedError until
+            # the corresponding follow-up PR extends it:
             #   - n_bootstrap > 0 + within-group-varying PSU
             #     (PR 4: cell-level Hall-Mammen wild bootstrap)
-            strata_varies, psu_varies = _strata_psu_vary_within_group(
+            _, psu_varies = _strata_psu_vary_within_group(
                 resolved_survey, data, group, survey_weights,
             )
-            if strata_varies or psu_varies:
-                if heterogeneity is not None:
-                    raise NotImplementedError(
-                        "heterogeneity= is not supported under a survey "
-                        "design whose PSU or strata vary within group. "
-                        "The heterogeneity WLS path uses the legacy "
-                        "group-level IF expansion and will be extended "
-                        "to the cell-period allocator in a follow-up "
-                        "PR. For now, either (a) set heterogeneity=None, "
-                        "or (b) collapse PSU/strata to be constant "
-                        "within each group."
-                    )
-                if self.n_bootstrap > 0:
-                    raise NotImplementedError(
-                        "n_bootstrap > 0 is not supported under a "
-                        "survey design whose PSU varies within group. "
-                        "The PSU-level Hall-Mammen wild bootstrap uses "
-                        "the legacy group-level PSU map and will be "
-                        "extended to cell-level PSU in a follow-up PR. "
-                        "For now, use n_bootstrap=0 (analytical TSL "
-                        "variance, which fully supports within-group-"
-                        "varying PSU via the cell-period allocator)."
-                    )
+            if psu_varies and self.n_bootstrap > 0:
+                raise NotImplementedError(
+                    "n_bootstrap > 0 is not supported under a "
+                    "survey design whose PSU varies within group. "
+                    "The PSU-level Hall-Mammen wild bootstrap uses "
+                    "the legacy group-level PSU map and will be "
+                    "extended to cell-level PSU in a follow-up PR. "
+                    "For now, use n_bootstrap=0 (analytical TSL "
+                    "variance, which fully supports within-group-"
+                    "varying PSU via the cell-period allocator)."
+                )
             _validate_cell_constant_strata_psu(
                 resolved_survey, data, group, time, survey_weights,
             )
@@ -3734,15 +3717,45 @@ def _compute_heterogeneity_test(
         Required when ``obs_survey_info`` is supplied.
     obs_survey_info : dict, optional
         Observation-level survey info with keys ``group_ids`` (raw per-row
-        group labels), ``weights`` (per-row survey weights), and ``resolved``
-        (ResolvedSurveyDesign). When provided, the regression uses WLS with
-        per-group weights W_g = sum of obs survey weights. SE is computed
-        via Binder TSL IF expansion through ``compute_survey_if_variance``
-        by default; under a replicate-weight design (BRR/Fay/JK1/JKn/SDR),
-        dispatches to ``compute_replicate_if_variance`` for Rao-Wu-style
-        variance. The effective df for t-critical values follows the
-        site-level ``min(df_s, n_valid_het - 1)`` rule and the helper
-        mutates ``replicate_n_valid_list`` so the final
+        group labels), ``time_ids`` (raw per-row period labels),
+        ``weights`` (per-row survey weights), ``resolved``
+        (ResolvedSurveyDesign), and ``periods`` (sorted canonical period
+        array matching ``Y_mat``'s column order). When provided, the
+        regression uses WLS with per-group weights
+        ``W_g = sum of obs survey weights in group g``. The group-level
+        WLS coefficient IF is
+        ``ψ_g = inv(X'WX)[1,:] @ x_g * W_g * r_g``. Two observation-level
+        expansions of ``ψ_g`` coexist on this path, split by variance
+        helper so each path uses the allocator that preserves
+        byte-identity for its aggregation rule:
+
+        * **Binder TSL** (``compute_survey_if_variance``): the
+          cell-period single-cell allocator —
+          ``ψ_i = ψ_g * (w_i / W_{g, out_idx})`` for obs in
+          ``(g, out_idx)``, zero elsewhere. Under PSU=group per-obs
+          distribution differs from the legacy
+          ``ψ_i = ψ_g * (w_i / W_g)`` but PSU-level aggregates
+          telescope to the same ``ψ_g``, so Binder variance is
+          byte-identical to the pre-cell-period release. Under
+          within-group-varying PSU mass lands in the post-period PSU
+          of the transition (DID_l post-period convention).
+        * **Rao-Wu replicate** (``compute_replicate_if_variance``):
+          the legacy group-level allocator ``ψ_i = ψ_g * (w_i / W_g)``.
+          Replicate variance computes ``θ_r = sum_i ratio_ir * ψ_i``
+          at observation level, so moving ψ_g mass onto the
+          post-period cell would silently change the replicate SE
+          whenever a replicate column's ratios vary within a group
+          (which the library allows — e.g., per-row BRR/Fay/SDR
+          matrices). Keeping the legacy allocator on this branch
+          preserves byte-identity of replicate SE across every
+          previously-supported fit. Replicate + within-group-varying
+          PSU is unreachable by construction (``SurveyDesign``
+          rejects ``replicate_weights`` combined with explicit
+          ``strata/psu/fpc``).
+
+        The effective df for t-critical values follows the site-level
+        ``min(df_s, n_valid_het - 1)`` rule and the helper mutates
+        ``replicate_n_valid_list`` so the final
         ``_effective_df_survey(...)`` sees this site's n_valid.
     replicate_n_valid_list : list[int], optional
         Shared accumulator for replicate-weight ``n_valid`` counts across
@@ -3931,25 +3944,46 @@ def _compute_heterogeneity_test(
                 XtWX_inv = np.linalg.pinv(XtWX)
                 psi_g = (XtWX_inv[1, :] @ design.T) * W_elig * r_g  # (n_eligible,)
 
-            # Expand to obs level: ψ_i = ψ_g * (w_i / W_g) for i in group g.
-            psi_obs = np.zeros(len(obs_w_raw))
-            for e_idx, g_idx in enumerate(eligible):
-                gid = gid_list[g_idx]
-                mask_g = (obs_gids_raw == gid) & valid
-                w_sum_g = obs_w_raw[mask_g].sum()
-                if w_sum_g > 0:
-                    psi_obs[mask_g] = psi_g[e_idx] * (
-                        obs_w_raw[mask_g] / w_sum_g
-                    )
-
-            # Dispatch: replicate-weight variance (BRR/Fay/JK1/JKn/SDR)
-            # vs Binder TSL across stratified PSUs. Mirrors the inline
-            # branch in _survey_se_from_group_if and the pattern in
-            # TripleDifference:1206-1238. Heterogeneity uses WLS with
-            # full-sample weights; theta_hat is treated as fixed per the
-            # FWL plug-in IF convention (REGISTRY.md Note on heterogeneity
-            # under replicate — no per-replicate refits).
+            # Allocator dispatch. Two observation-level expansions of
+            # ψ_g coexist on this path, split by variance helper:
+            #
+            #   * Binder TSL (compute_survey_if_variance): cell-period
+            #     single-cell allocator —
+            #       ψ_i = ψ_g * (w_i / W_{g, out_idx})
+            #     for obs in (g, out_idx), zero elsewhere. Under
+            #     PSU=group, per-obs distribution differs from the
+            #     legacy ψ_i = ψ_g * (w_i / W_g) but PSU-level
+            #     aggregates telescope to ψ_g, so Binder variance is
+            #     byte-identical. Under within-group-varying PSU, mass
+            #     lands in the post-period PSU of the transition, which
+            #     is what Binder needs. DID_l single-cell convention —
+            #     see REGISTRY.md ChaisemartinDHaultfoeuille survey IF
+            #     expansion Note.
+            #
+            #   * Rao-Wu replicate (compute_replicate_if_variance):
+            #     legacy group-level allocator —
+            #       ψ_i = ψ_g * (w_i / W_g)
+            #     for obs in group g. Replicate variance computes
+            #     θ_r = sum_i ratio_ir * ψ_i at observation level, so
+            #     moving ψ_g onto the post-period cell only would
+            #     silently change the replicate SE whenever a
+            #     replicate column's ratios vary within group (e.g.,
+            #     the per-row replicate matrices this library
+            #     accepts). The group-level allocator preserves
+            #     byte-identity for all replicate usages under
+            #     PSU=group. The replicate + within-group-varying
+            #     PSU case is not reachable (SurveyDesign rejects
+            #     replicate_weights combined with explicit psu).
             if getattr(resolved, "uses_replicate_variance", False):
+                psi_obs = np.zeros(len(obs_w_raw), dtype=np.float64)
+                for e_idx, g_idx in enumerate(eligible):
+                    gid = gid_list[g_idx]
+                    mask_g = (obs_gids_raw == gid) & valid
+                    w_sum_g = obs_w_raw[mask_g].sum()
+                    if w_sum_g > 0:
+                        psi_obs[mask_g] = psi_g[e_idx] * (
+                            obs_w_raw[mask_g] / w_sum_g
+                        )
                 var_s, n_valid_het = compute_replicate_if_variance(
                     psi_obs, resolved
                 )
@@ -3968,6 +4002,23 @@ def _compute_heterogeneity_test(
                 else:
                     df_s_local = min(int(df_s), int(n_valid_het) - 1)
             else:
+                obs_tids = np.asarray(obs_survey_info["time_ids"])
+                periods_arr = np.asarray(obs_survey_info["periods"])
+                psi_obs = np.zeros(len(obs_w_raw), dtype=np.float64)
+                for e_idx, g_idx in enumerate(eligible):
+                    gid = gid_list[g_idx]
+                    out_idx = first_switch_idx[g_idx] - 1 + l_h
+                    t_val_out = periods_arr[out_idx]
+                    mask_cell = (
+                        (obs_gids_raw == gid)
+                        & (obs_tids == t_val_out)
+                        & valid
+                    )
+                    w_cell = obs_w_raw[mask_cell].sum()
+                    if w_cell > 0:
+                        psi_obs[mask_cell] = psi_g[e_idx] * (
+                            obs_w_raw[mask_cell] / w_cell
+                        )
                 var_s = compute_survey_if_variance(psi_obs, resolved)
                 df_s_local = df_s
             se_het = (
@@ -5413,12 +5464,14 @@ def _strata_psu_vary_within_group(
 ) -> Tuple[bool, bool]:
     """Return (strata_varies_within_group, psu_varies_within_group).
 
-    Diagnostic helper used to gate out-of-scope combinations for the
-    cell-period IF allocator — heterogeneity and ``n_bootstrap > 0``
-    currently require within-group constancy because they read
-    ``obs_survey_info`` through the legacy group-level expansion path.
-    PR 3 and PR 4 will extend them. Zero-weight rows are excluded from
-    the check (subpopulation contract).
+    Diagnostic helper used at ``fit()`` time to gate the remaining
+    out-of-scope combination for the cell-period IF allocator:
+    ``n_bootstrap > 0`` still uses a group-level PSU map and raises
+    ``NotImplementedError`` when PSU varies within group. The
+    heterogeneity WLS path supports within-group-varying PSU/strata
+    via the cell-period allocator (shipped in the PR that lifted the
+    previous gate). Zero-weight rows are excluded from the check
+    (subpopulation contract).
     """
     if resolved is None:
         return False, False
@@ -5766,10 +5819,12 @@ def _survey_se_from_group_if(
     else:
         # Legacy group-level allocator (no per-period attribution
         # provided, or time/period info unavailable). Preserved for
-        # paths that haven't threaded per-period attribution through
-        # yet (e.g., the heterogeneity psi_obs construction in
-        # _compute_heterogeneity_test — gated to within-group-constant
-        # PSU in Stage 2 per PR 2 scope).
+        # defensive fallback and for unit tests that exercise the
+        # legacy allocator. No current caller in fit() uses this
+        # branch — ATT / joiners / leavers / placebos all thread
+        # U_centered_per_period, and heterogeneity (as of PR 3)
+        # constructs its own cell-period psi_obs and calls
+        # compute_survey_if_variance directly.
         group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)}
         u_obs_eff = np.array([group_to_u.get(gid, 0.0) for gid in gids_eff])
         unique_gids, inverse = np.unique(gids_eff, return_inverse=True)
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 31ba4f72..b501a87b 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param
 
 - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`.
 
-- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (cell-period allocator):** Heterogeneity still uses the legacy group-level IF expansion `ψ_i = ψ_g * (w_i / W_g)`. Combining `heterogeneity=` with a survey design whose PSU or strata vary within group raises `NotImplementedError`; the cell-period extension of the heterogeneity path is deferred to a follow-up PR. Within-group-constant PSU/strata (including the auto-injected `psu=group`) is fully supported.
+- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the group-level WLS coefficient influence function is `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`. The group-level IF is then attributed to observation level via **one of two allocators, chosen by variance helper** so each path preserves byte-identity for its aggregation rule: (1) **Binder TSL** (`compute_survey_if_variance`) uses the **cell-period single-cell allocator** — at each horizon `l_h`, `ψ_g` is assigned in full to the post-period cell `(g, out_idx)` with `out_idx = first_switch_idx[g] - 1 + l_h` and expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})` for obs in that cell, zero elsewhere (matches the DID_l post-period convention in the Survey IF expansion Note below). Under PSU=group per-observation distribution differs from the legacy `ψ_i = ψ_g * (w_i / W_g)`, but PSU-level aggregates telescope to the same `ψ_g` — so Binder TSL variance is byte-identical to the pre-cell-period release under PSU=group. Under within-group-varying PSU mass lands in the post-period PSU of the transition, which is what Binder TSL needs. An **empty post-period cell under zero-weight obs** (all obs at `(g, out_idx)` have `w_i = 0` despite `N > 0`) drops the group's contribution, matching the ATT cell allocator's convention; the pre-cell-period path diverged here by redistributing mass to other cells of the group. (2) **Rao-Wu replicate** (`compute_replicate_if_variance`) uses the **legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)`. Replicate variance computes `θ_r = sum_i ratio_ir * ψ_i` at the observation level, so moving `ψ_g` mass onto the post-period cell only would silently change the replicate SE whenever a replicate column's ratios vary within group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). Keeping the legacy allocator on this branch preserves byte-identity of replicate SE across every previously-supported fit; replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (bootstrap):** Combining `heterogeneity=` with `n_bootstrap > 0` and within-group-varying PSU still raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap uses a group-level PSU map until the follow-up PR extends it.
 
 - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning.
 
@@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param
 - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6)
 - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface
 - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`.
-- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=<group column>` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=<col>` or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=<group>` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4.
+- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=<group column>` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=<col>` or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=<group>` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Post-period attribution extends to heterogeneity (Binder TSL branch only):** the heterogeneity WLS coefficient IF `ψ_g = inv(X'WX)[1,:] @ x_g * W_g * r_g` is attributed in full to the single post-period cell `(g, out_idx)` at each horizon (same single-cell convention as DID_l), then expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})`, and fed through `compute_survey_if_variance`. Under PSU=group the PSU-level aggregate telescopes to `ψ_g`, so Binder variance is byte-identical relative to the pre-cell-period release; under within-group-varying PSU mass lands in the post-period PSU. **Replicate-weight branch keeps the legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)` because `compute_replicate_if_variance` computes `θ_r = sum_i ratio_ir * ψ_i` at observation level and is therefore not PSU-telescoping: redistributing mass onto the post-period cell would silently change the replicate SE whenever a replicate column's ratios vary within a group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). The legacy allocator preserves byte-identity of the replicate SE for every previously-supported fit. Replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). **Scope limitation (follow-up PR):** `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in a follow-up PR.
 - **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design.
 
 ---
diff --git a/tests/test_dcdh_heterogeneity_cell_period_coverage.py b/tests/test_dcdh_heterogeneity_cell_period_coverage.py
new file mode 100644
index 00000000..f4012572
--- /dev/null
+++ b/tests/test_dcdh_heterogeneity_cell_period_coverage.py
@@ -0,0 +1,158 @@
+"""Monte Carlo coverage simulation for the cell-period IF allocator
+applied to the heterogeneity WLS path (PR 3).
+
+Validates empirical null coverage under a DGP with PSU that varies
+across cells of the same group — the regime unlocked by PR 3's lift
+of the heterogeneity gate. Under within-group-constant PSU the cell
+allocator's PSU-level aggregate matches the previous group allocator
+byte-for-byte (see `TestHeterogeneityCellPeriod::
+test_psu_level_byte_identity_under_psu_equals_group` in
+`test_survey_dcdh.py`), so the coverage check only exercises the new
+within-group-varying-PSU code path.
+
+Null-coverage rationale:
+
+    beta_het_true = 0 is deliberate. The heterogeneity test's
+    identifying estimand under Lemma 7 is a variance-weighted average
+    of effect differences, so a non-zero DGP-level beta does not have
+    a clean "true value" for coverage. A constant-treatment-effect DGP
+    ensures beta_het_true = 0 exactly, so the CI must cover 0 at
+    nominal 95% regardless of Lemma 7's variance weighting.
+
+Marked ``slow`` and excluded from the default pytest run. To execute:
+
+    pytest tests/test_dcdh_heterogeneity_cell_period_coverage.py -m slow -v
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from diff_diff.chaisemartin_dhaultfoeuille import ChaisemartinDHaultfoeuille
+from diff_diff.survey import SurveyDesign
+
+
+def _simulate_panel(
+    n_groups: int,
+    n_periods: int,
+    first_treated_period: int,
+    tau: float,
+    rng: np.random.Generator,
+    psu_sigma: float = 0.5,
+    obs_sigma: float = 1.0,
+    group_sigma: float = 1.0,
+) -> pd.DataFrame:
+    """Generate a one-obs-per-cell panel with within-group-varying PSU
+    and a group-level time-invariant covariate X_het.
+
+    Treatment effect ``tau`` is constant (independent of ``X_het``) so
+    the true heterogeneity coefficient on X_het is exactly 0 under
+    Lemma 7 — the null-coverage target for this test.
+    """
+    groups = np.arange(n_groups)
+    treated = groups < n_groups // 2
+    group_fe = rng.normal(0.0, group_sigma, size=n_groups)
+    # (group, psu) effect — constant across time within the (g, psu)
+    # pair. Drives within-group residual correlation that varies by
+    # cell, so design-based variance must capture it per-PSU.
+    psu_fe = rng.normal(0.0, psu_sigma, size=(n_groups, 2))
+    # Group-level binary covariate (time-invariant).
+    x_het = rng.binomial(1, 0.5, size=n_groups).astype(float)
+
+    rows = []
+    for g in groups:
+        for t in range(n_periods):
+            parity = int(t % 2)
+            # PSU labels are globally unique per (group, parity) so
+            # Binder variance sees 2 * n_groups sampling clusters
+            # rather than 2 global PSU codes reused across groups.
+            # Within-cell constancy (one obs per cell) is trivially
+            # satisfied.
+            psu_id = int(g) * 2 + parity
+            d = 1 if (treated[g] and t >= first_treated_period) else 0
+            y = group_fe[g] + 0.1 * t + tau * d + psu_fe[g, parity] + rng.normal(0.0, obs_sigma)
+            rows.append(
+                {
+                    "group": int(g),
+                    "period": int(t),
+                    "treatment": int(d),
+                    "outcome": float(y),
+                    "psu": psu_id,
+                    "pw": 1.0,
+                    "x_het": float(x_het[g]),
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+@pytest.mark.slow
+def test_heterogeneity_cell_period_null_coverage_varying_psu():
+    """Empirical 95% null coverage for beta_het under the cell-period
+    allocator on a DGP with within-group-varying PSU. Tolerance ±2.5pp
+    mirrors the ATT coverage test in test_dcdh_cell_period_coverage.py
+    (about 2 MC-sigma at n_reps=500 for a true 95% target).
+    """
+    n_reps = 500
+    n_groups = 40
+    n_periods = 6
+    first_treated_period = 3
+    tau_true = 2.0
+
+    rng = np.random.default_rng(20260419)
+    covered = 0
+    failed = 0
+
+    for r in range(n_reps):
+        df = _simulate_panel(
+            n_groups=n_groups,
+            n_periods=n_periods,
+            first_treated_period=first_treated_period,
+            tau=tau_true,
+            rng=rng,
+        )
+        sd = SurveyDesign(weights="pw", psu="psu")
+        try:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                res = ChaisemartinDHaultfoeuille(seed=r + 1).fit(
+                    df,
+                    outcome="outcome",
+                    group="group",
+                    time="period",
+                    treatment="treatment",
+                    heterogeneity="x_het",
+                    survey_design=sd,
+                    L_max=1,
+                )
+        except Exception:
+            failed += 1
+            continue
+
+        if res.heterogeneity_effects is None:
+            failed += 1
+            continue
+        entry = res.heterogeneity_effects[1]
+        ci = entry["conf_int"]
+        if ci is None or not all(np.isfinite(ci)):
+            failed += 1
+            continue
+        lo, hi = float(ci[0]), float(ci[1])
+        # Null coverage: beta_het_true = 0 since tau is constant.
+        if lo <= 0.0 <= hi:
+            covered += 1
+
+    completed = n_reps - failed
+    assert completed >= int(0.95 * n_reps), (
+        f"MC simulation had {failed}/{n_reps} fit failures, above " f"the 5% tolerance."
+    )
+    coverage = covered / completed
+    assert 0.925 <= coverage <= 0.975, (
+        f"Empirical null coverage {coverage:.3f} (completed "
+        f"{completed}) outside [0.925, 0.975]; tau_true={tau_true}, "
+        f"beta_het_true=0, n_groups={n_groups}, n_periods={n_periods}, "
+        f"n_reps={n_reps}."
+    )
diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py
index e400c3eb..4ac5472f 100644
--- a/tests/test_survey_dcdh.py
+++ b/tests/test_survey_dcdh.py
@@ -1075,9 +1075,10 @@ class TestSurveyWithinGroupValidation:
     """Cell-period IF allocator contract: strata and PSU may vary ACROSS
     cells of a group, but must be constant WITHIN each (g, t) cell. In
     canonical one-obs-per-cell panels the cell-level constancy check is
-    trivially satisfied. Out-of-scope combinations (heterogeneity +
-    within-group-varying PSU; n_bootstrap > 0 + within-group-varying
-    PSU) raise NotImplementedError with a pointer to the follow-up PR.
+    trivially satisfied. Heterogeneity + within-group-varying PSU is
+    supported via the cell-period allocator. The remaining out-of-scope
+    combination is n_bootstrap > 0 + within-group-varying PSU, which
+    still raises NotImplementedError with a pointer to the follow-up PR.
     """
 
     def test_accepts_varying_psu_within_group(self, base_data):
@@ -1138,23 +1139,35 @@ def test_accepts_varying_strata_within_group(self, base_data):
         assert np.isfinite(result.overall_att)
         assert np.isfinite(result.overall_se)
 
-    def test_heterogeneity_with_varying_psu_raises(self, base_data):
-        """heterogeneity= is gated under within-group-varying PSU until
-        PR 3 ships the cell-period allocator for the WLS psi_obs path.
+    def test_heterogeneity_with_varying_psu_succeeds(self, base_data):
+        """heterogeneity= is supported under within-group-varying PSU
+        via the cell-period allocator: psi_g is attributed in full to
+        the (g, out_idx) post-period cell and expanded to obs level as
+        psi_i = psi_g * (w_i / W_{g, out_idx}). All five inference
+        fields must be finite when the survey design is regular.
         """
         df_ = base_data.copy()
         df_["pw"] = 1.0
-        df_["stratum"] = 0
-        df_["psu"] = df_["period"] % 2  # varies within group
-        df_["x_het"] = np.arange(len(df_)) % 3  # categorical covariate
-        sd = SurveyDesign(weights="pw", strata="stratum", psu="psu")
-        with pytest.raises(NotImplementedError, match="heterogeneity"):
-            ChaisemartinDHaultfoeuille(seed=1).fit(
-                df_, outcome="outcome", group="group",
-                time="period", treatment="treatment",
-                heterogeneity="x_het", L_max=1,
-                survey_design=sd,
-            )
+        # Make x_het time-invariant within each group (heterogeneity
+        # test requires a group-level covariate).
+        df_["x_het"] = (df_["group"].astype(int) % 2).astype(float)
+        # Per-group PSU parity — unique per (group, parity), varies
+        # within group so the cell-period allocator is exercised.
+        df_["psu"] = df_["group"].astype(int) * 2 + (df_["period"].astype(int) % 2)
+        sd = SurveyDesign(weights="pw", psu="psu")
+        res = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_, outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            heterogeneity="x_het", L_max=1,
+            survey_design=sd,
+        )
+        assert res.heterogeneity_effects is not None
+        entry = res.heterogeneity_effects[1]
+        assert np.isfinite(entry["beta"])
+        assert np.isfinite(entry["se"]) and entry["se"] >= 0.0
+        assert np.isfinite(entry["t_stat"])
+        assert np.isfinite(entry["p_value"])
+        assert all(np.isfinite(entry["conf_int"]))
 
     def test_bootstrap_with_varying_psu_raises(self, base_data):
         """n_bootstrap > 0 is gated under within-group-varying PSU until
@@ -1212,6 +1225,59 @@ def test_auto_inject_with_varying_strata_nest_true_succeeds(self, base_data):
             == r_explicit.survey_metadata.df_survey
         )
 
+    def test_heterogeneity_auto_inject_with_varying_strata_nest_true_succeeds(self, base_data):
+        """PR 3 unblocks heterogeneity + SurveyDesign(strata, nest=True)
+        with auto-inject psu=group. Under nest=True the resolver
+        combines (stratum, psu) into globally-unique labels, so
+        resolved.psu varies across cells of each group and the
+        cell-period allocator handles it. Mirrors ATT coverage at
+        test_auto_inject_with_varying_strata_nest_true_succeeds.
+        """
+        df_ = base_data.copy()
+        df_["pw"] = 1.0
+        df_["stratum"] = df_["period"] % 2
+        df_["x_het"] = (df_["group"].astype(int) % 2).astype(float)
+        sd = SurveyDesign(weights="pw", strata="stratum", nest=True)
+        res = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_, outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            heterogeneity="x_het", L_max=1,
+            survey_design=sd,
+        )
+        assert res.heterogeneity_effects is not None
+        entry = res.heterogeneity_effects[1]
+        assert np.isfinite(entry["beta"])
+        assert np.isfinite(entry["se"]) and entry["se"] >= 0.0
+        assert np.isfinite(entry["t_stat"])
+        assert np.isfinite(entry["p_value"])
+        assert all(np.isfinite(entry["conf_int"]))
+
+    def test_heterogeneity_multi_horizon_varying_psu_succeeds(self, base_data):
+        """Multi-horizon heterogeneity (L_max >= 2) + within-group-
+        varying PSU — each horizon rebuilds its own (g, out_idx) cell
+        mapping, so the per-horizon allocator must produce finite
+        inference independently at every horizon.
+        """
+        df_ = base_data.copy()
+        df_["pw"] = 1.0
+        df_["x_het"] = (df_["group"].astype(int) % 2).astype(float)
+        df_["psu"] = df_["group"].astype(int) * 2 + (df_["period"].astype(int) % 2)
+        sd = SurveyDesign(weights="pw", psu="psu")
+        res = ChaisemartinDHaultfoeuille(seed=1).fit(
+            df_, outcome="outcome", group="group",
+            time="period", treatment="treatment",
+            heterogeneity="x_het", L_max=2,
+            survey_design=sd,
+        )
+        assert res.heterogeneity_effects is not None
+        for horizon in (1, 2):
+            entry = res.heterogeneity_effects[horizon]
+            assert np.isfinite(entry["beta"]), f"beta NaN at horizon {horizon}"
+            assert np.isfinite(entry["se"]) and entry["se"] >= 0.0
+            assert np.isfinite(entry["t_stat"])
+            assert np.isfinite(entry["p_value"])
+            assert all(np.isfinite(entry["conf_int"]))
+
     def test_auto_inject_with_varying_strata_raises(self, base_data):
         """Auto-injected `psu=<group>` with nest=False cannot honor
         strata that vary across cells of a group — the synthesized PSU
@@ -1543,3 +1609,173 @@ def test_within_cell_check_excludes_zero_weight_rows(self, base_data):
             survey_design=sd,
         )
         assert np.isfinite(result.overall_att)
+
+
+class TestHeterogeneityCellPeriod:
+    """Unit tests for the heterogeneity cell-period allocator invariants.
+
+    Under PSU=group the new single-cell psi_obs distribution
+    (mass in the (g, out_idx) post-period cell only, scaled by
+    w_i / W_{g, out_idx}) differs from the legacy group-level
+    distribution (mass everywhere in g, scaled by w_i / W_g) at the
+    observation level. But both telescope to the same PSU-level sum
+    psi_g because compute_survey_if_variance aggregates to PSU first.
+    Binder TSL variance must therefore be byte-identical.
+    """
+
+    def test_psu_level_byte_identity_under_psu_equals_group(self):
+        """Construct both legacy and new psi_obs on a tiny fixture
+        (PSU=group, one obs per cell), feed both through
+        compute_survey_if_variance, and assert variances equal
+        within ULP — the exact invariant the REGISTRY Note claims.
+        """
+        from diff_diff.survey import (
+            SurveyDesign,
+            compute_survey_if_variance,
+        )
+
+        # Fixture: 4 groups * 4 periods = 16 obs, one obs per cell,
+        # pw=1 everywhere. PSU=group so each group is a single PSU.
+        n_groups, n_periods = 4, 4
+        rows = []
+        for g in range(n_groups):
+            for t in range(n_periods):
+                rows.append({
+                    "group": int(g),
+                    "period": int(t),
+                    "pw": 1.0,
+                })
+        df_ = pd.DataFrame(rows)
+        sd = SurveyDesign(weights="pw", psu="group")
+        resolved = sd.resolve(df_)
+
+        # Arbitrary non-zero group-level IF values + per-group out_idx
+        # (the post-period cell chosen by the heterogeneity horizon).
+        psi_g = np.array([1.0, -0.5, 2.0, -1.5], dtype=np.float64)
+        out_idx_per_group = np.array([3, 3, 2, 2], dtype=np.int64)
+
+        obs_group = df_["group"].values.astype(np.int64)
+        obs_period = df_["period"].values.astype(np.int64)
+        w = df_["pw"].to_numpy(dtype=np.float64)
+
+        # Legacy expansion: psi_i = psi_g[g_i] * w_i / W_g.
+        W_g = np.zeros(n_groups, dtype=np.float64)
+        np.add.at(W_g, obs_group, w)
+        psi_legacy = np.zeros_like(w)
+        for i in range(len(w)):
+            if W_g[obs_group[i]] > 0:
+                psi_legacy[i] = psi_g[obs_group[i]] * (w[i] / W_g[obs_group[i]])
+
+        # New expansion: psi_i = psi_g[g_i] * w_i / W_{g, out_idx}
+        # for obs in (g, out_idx), zero elsewhere.
+        W_cell_out = np.zeros(n_groups, dtype=np.float64)
+        for i in range(len(w)):
+            if obs_period[i] == out_idx_per_group[obs_group[i]]:
+                W_cell_out[obs_group[i]] += w[i]
+        psi_new = np.zeros_like(w)
+        for i in range(len(w)):
+            g_i = obs_group[i]
+            if obs_period[i] == out_idx_per_group[g_i] and W_cell_out[g_i] > 0:
+                psi_new[i] = psi_g[g_i] * (w[i] / W_cell_out[g_i])
+
+        # Distributions differ at the obs level.
+        assert not np.allclose(psi_legacy, psi_new), (
+            "fixture should exercise the mass redistribution — "
+            "legacy spreads across all obs of g, new concentrates in "
+            "(g, out_idx)."
+        )
+
+        # PSU-level sums must match — this is the invariant the
+        # REGISTRY Note claims under PSU=group.
+        assert resolved.psu is not None
+        psu_codes = np.asarray(resolved.psu, dtype=np.int64)
+        psu_sum_legacy = np.zeros(n_groups, dtype=np.float64)
+        psu_sum_new = np.zeros(n_groups, dtype=np.float64)
+        np.add.at(psu_sum_legacy, psu_codes, psi_legacy)
+        np.add.at(psu_sum_new, psu_codes, psi_new)
+        assert np.allclose(psu_sum_legacy, psu_sum_new, atol=0.0, rtol=1e-15)
+
+        # Binder TSL variances must match byte-for-byte (single stratum,
+        # each PSU sum contributes equally in both paths).
+        var_legacy = compute_survey_if_variance(psi_legacy, resolved)
+        var_new = compute_survey_if_variance(psi_new, resolved)
+        assert np.isfinite(var_legacy) and np.isfinite(var_new)
+        assert var_legacy == pytest.approx(var_new, rel=1e-14, abs=1e-14)
+
+    def test_replicate_variance_non_invariance_under_varying_ratios(self):
+        """When replicate-weight ratios vary within group,
+        compute_replicate_if_variance is NOT PSU-telescoping — its
+        theta_r = sum_i ratio_ir * psi_i reads psi at observation
+        level, so redistributing mass across cells of g changes the
+        replicate variance. This is why the heterogeneity path keeps
+        the legacy group-level allocator on the replicate branch
+        (only Binder TSL uses the cell-period allocator). Regression
+        guard for the CI-review P1 on PR #329.
+        """
+        from diff_diff.survey import (
+            SurveyDesign,
+            compute_replicate_if_variance,
+        )
+
+        # Minimal reproduction of the reviewer's counterexample: one
+        # group with two obs (ref and post-period cells), replicate
+        # ratios that vary within the group. legacy allocator spreads
+        # psi_g across both obs; new cell-period allocator puts it
+        # all on the post-period cell. Replicate variance differs.
+        rows = []
+        for g in range(2):
+            for t in range(2):
+                rows.append({
+                    "group": int(g),
+                    "period": int(t),
+                    "pw": 1.0,
+                    # Non-PSU-aligned replicate columns: vary within
+                    # each group so sum_i ratio_ir * psi_i sees a
+                    # different weighted average of psi mass under
+                    # the two allocators.
+                    "rep0": 0.5 if t == 0 else 1.5,
+                    "rep1": 1.5 if t == 0 else 0.5,
+                })
+        df_ = pd.DataFrame(rows)
+        sd = SurveyDesign(
+            weights="pw",
+            replicate_weights=["rep0", "rep1"],
+            replicate_method="SDR",
+        )
+        resolved = sd.resolve(df_)
+
+        # Arbitrary non-zero psi_g per group; out_idx = 1 for each.
+        psi_g = np.array([0.75, -1.25], dtype=np.float64)
+        obs_group = df_["group"].values.astype(np.int64)
+        obs_period = df_["period"].values.astype(np.int64)
+        w = df_["pw"].to_numpy(dtype=np.float64)
+
+        # Legacy group-level expansion (what the replicate branch now
+        # uses inside _compute_heterogeneity_test after the PR-329 fix).
+        W_g = np.zeros(2, dtype=np.float64)
+        np.add.at(W_g, obs_group, w)
+        psi_legacy = np.zeros_like(w)
+        for i in range(len(w)):
+            if W_g[obs_group[i]] > 0:
+                psi_legacy[i] = psi_g[obs_group[i]] * (
+                    w[i] / W_g[obs_group[i]]
+                )
+
+        # New cell-period single-cell expansion (what the Binder TSL
+        # branch uses). All mass lands on obs at t == 1.
+        psi_new = np.zeros_like(w)
+        for i in range(len(w)):
+            if obs_period[i] == 1:
+                psi_new[i] = psi_g[obs_group[i]]  # W_{g,out_idx}=1
+
+        var_legacy, _ = compute_replicate_if_variance(psi_legacy, resolved)
+        var_new, _ = compute_replicate_if_variance(psi_new, resolved)
+        assert np.isfinite(var_legacy) and np.isfinite(var_new)
+        # Documented non-invariance: replicate variance differs
+        # materially between the two allocators on this fixture.
+        assert not np.isclose(var_legacy, var_new, rtol=1e-6), (
+            f"Expected legacy vs cell-period replicate variance to "
+            f"differ when replicate ratios vary within group "
+            f"(counterexample from PR #329 CI review). Got "
+            f"var_legacy={var_legacy}, var_new={var_new}."
+        )