From 225674cda82436b710b404d327ac16113890a965 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 23 Apr 2026 20:26:21 -0400 Subject: [PATCH 1/2] Fix TROP bootstrap SE backend divergence under fixed seed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rust and Python TROP backends produced different bootstrap standard errors for the same `seed` value. On a tiny correlated panel under `seed=42` the gap was ~28% of SE: Rust seeded `rand_xoshiro:: Xoshiro256PlusPlus` per replicate while Python's fallback consumed `numpy.random.default_rng` (PCG64), so identical seeds mapped to different bytestreams. Canonicalize on numpy. New `stratified_bootstrap_indices` helper in `diff_diff/bootstrap_utils.py` pre-generates per-replicate (control, treated) positional index arrays from a numpy `Generator` and hands them to both backends through the PyO3 surface — both Rust bootstrap functions (`bootstrap_trop_variance_global`, `bootstrap_trop_variance`) now accept `control_indices` and `treated_indices` as `i64` arrays in place of `seed: u64`. Parallelism is preserved. Sampling law (stratified: controls then treated, with replacement) is unchanged. Global-method SE is now backend-invariant under the same seed to machine precision: the prior `xfail(strict=True)` in `test_bootstrap_seed_reproducibility` is flipped to a passing `assert_allclose(atol=rtol=1e-14)` and parametrized over `[0, 42, 12345]`. A companion `test_bootstrap_seed_reproducibility_local` is added for the local-method bootstrap. It is currently `xfail(strict=True)` because aligning the RNG exposed two separate local-method backend divergences beyond this PR's scope: Rust's `compute_weight_matrix` normalizes time and unit weights to sum to 1, while Python's `_compute_observation_weights` does not; and the Python fallback's `_compute_observation_weights(_precomputed branch)` reads the original-panel cached `Y`/`D` instead of the bootstrap-sample arguments. Both are tracked as follow-up rows in `TODO.md` with file:line pointers and will land in a separate methodology PR. Closes the bootstrap half of silent-failures audit finding #23 (the grid-search half closed in PR #348). Reference: Athey, Imbens, Qu & Viviano (2025), "Triply Robust Panel Estimators", Algorithm 3. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + TODO.md | 3 +- diff_diff/bootstrap_utils.py | 41 ++++++++++ diff_diff/trop_global.py | 54 +++++++------ diff_diff/trop_local.py | 64 +++++++++------- rust/src/trop.rs | 104 +++++++++++++++---------- tests/test_bootstrap_utils.py | 74 ++++++++++++++++++ tests/test_rust_backend.py | 138 +++++++++++++++++++++++++++------- 8 files changed, 359 insertions(+), 120 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58ab0324..fe01ed9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SyntheticDiD `variance_method="bootstrap"` now computes p-values from the analytical normal-theory formula using the bootstrap SE (matching R's `synthdid::vcov()` convention), rather than an empirical null-distribution formula that is not valid for bootstrap draws. `is_significant` and `significance_stars` are derived from `p_value` and will also change for bootstrap fits. Placebo and jackknife are unchanged. Point estimates are unaffected. - SyntheticDiD bootstrap SE formula applies the `sqrt((r-1)/r)` correction matching R's synthdid and the placebo SE formula. - SyntheticDiD bootstrap now retries degenerate resamples (all-control or all-treated, or non-finite `τ_b`) until exactly `n_bootstrap` valid replicates are accumulated, matching R's `synthdid::bootstrap_sample` and Arkhangelsky et al. (2021) Algorithm 2. Previously the Python path counted attempts (with degenerate draws silently dropped), producing fewer valid replicates than requested. A bounded-attempt guard (`20 × n_bootstrap`) prevents pathological-input hangs. +- **TROP global bootstrap SE backend parity under fixed seed** — Rust and Python backends now produce bit-identical bootstrap SE under the same `seed`. Previously Rust's `bootstrap_trop_variance_global` seeded `rand_xoshiro::Xoshiro256PlusPlus` per replicate while Python's fallback consumed `numpy.random.default_rng` (PCG64), producing ~28% SE divergence on tiny panels under `seed=42`. Fixed by extracting a shared `stratified_bootstrap_indices` helper in `diff_diff/bootstrap_utils.py` that pre-generates per-replicate stratified sample indices via numpy on the Python side; both backends consume the same integer arrays through the PyO3 surface. Sampling law (stratified: controls then treated, with replacement) is unchanged. Closes silent-failures audit finding #23 (bootstrap half; grid-search half closed in PR #348). Local-method TROP also adopts the Python-canonical index contract for the RNG layer, but separately-discovered backend divergences (Rust normalizes weight-matrix outer product, Python `_compute_observation_weights` reads stale `_precomputed` cache) prevent local-method bit-identity SE; tracked as a follow-up in `TODO.md`. ### Changed - **SyntheticDiD bootstrap no longer supports survey designs** (capability regression). The removed fixed-weight bootstrap path was the only SDID variance method that supported strata/PSU/FPC (via Rao-Wu rescaled bootstrap); the new paper-faithful refit bootstrap rejects all survey designs (including pweight-only) with `NotImplementedError`. Pweight-only users can switch to `variance_method="placebo"` or `"jackknife"`. Strata/PSU/FPC users have no SDID variance option on this release. Composing Rao-Wu rescaled weights with Frank-Wolfe re-estimation requires a separate derivation (weighted FW solver); sketch and reusable scaffolding pointers are in `docs/methodology/REGISTRY.md` §SyntheticDiD and `TODO.md`. diff --git a/TODO.md b/TODO.md index da80f6ea..ea89877f 100644 --- a/TODO.md +++ b/TODO.md @@ -83,7 +83,8 @@ Deferred items from PR reviews that were not addressed before merge. | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium | | Regenerate `benchmarks/data/clubsandwich_cr2_golden.json` from R (`Rscript benchmarks/R/generate_clubsandwich_golden.R`). Current JSON has `source: python_self_reference` as a stability anchor until an authoritative R run. | `benchmarks/R/generate_clubsandwich_golden.R` | Phase 1a | Medium | | `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low | -| TROP Rust vs Python bootstrap SE divergence under fixed seed: `seed=42` on a tiny panel produces ~28% bootstrap-SE gap. Root cause: Rust bootstrap uses its own RNG (`rand` crate) while Python uses `numpy.random.default_rng`; same seed value maps to different bytestreams across backends. Audit axis-H (RNG/seed) adjacent. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility` baselines the gap. Unifying RNG (threading a numpy-generated seed-sequence into Rust, or porting Python to ChaCha) would close it. | `trop_global.py`, `rust/` | follow-up | Medium | +| TROP local-method Rust vs Python bootstrap SE divergence beyond RNG: with stratified indices now Python-canonical (closes RNG axis-H), local-method SE still diverges by 10-20% on tiny panels. Two downstream causes: (a) Rust `compute_weight_matrix` (`rust/src/trop.rs:573-606`) normalizes time_weights and unit_weights to sum to 1 before the outer product; Python `_compute_observation_weights` (`diff_diff/trop_local.py:489`) does not normalize. (b) Python `_compute_observation_weights` reads `self._precomputed["Y"]`, `["D"]`, `["time_dist_matrix"]` (lines 423-458) — original-panel cache — rather than the bootstrap-sample data passed via the function arguments, so unit-distance computation in the Python fallback uses stale data. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility_local` baselines the gap across `seeds=[0, 42, 12345]`. Affects local-method TROP backend parity broadly, not just bootstrap. | `trop_local.py`, `rust/src/trop.rs` | follow-up | Medium | +| Rust multiplier-bootstrap weight RNG (`generate_bootstrap_weights_batch` in `rust/src/bootstrap.rs:9-10, 57-75`) uses `Xoshiro256PlusPlus::seed_from_u64(seed + i)` per row for Rademacher/Mammen/Webb generation. If any Python caller (SDID / efficient-DiD multiplier bootstrap) has a numpy-canonical equivalent, the two backends likely diverge under the same seed. Audit Python callers (`diff_diff/sdid.py`, `diff_diff/efficient_did_bootstrap.py`, `diff_diff/bootstrap_utils.py::generate_bootstrap_weights_batch_numpy`) for parity-test gaps. Same fix shape as TROP RNG parity (PR #NNN): pre-generate weights in Python via numpy and pass them to Rust through PyO3. | `rust/src/bootstrap.rs`, `diff_diff/bootstrap_utils.py` | follow-up | Medium | | `bias_corrected_local_linear`: extend golden parity to `kernel="triangular"` and `kernel="uniform"` (currently epa-only; all three kernels share `kernel_W` and the `lprobust` math, so parity is expected but not separately asserted). | `benchmarks/R/generate_nprobust_lprobust_golden.R`, `tests/test_bias_corrected_lprobust.py` | Phase 1c | Low | | `bias_corrected_local_linear`: expose `vce in {"hc0", "hc1", "hc2", "hc3"}` on the public wrapper once R parity goldens exist (currently raises `NotImplementedError`). The port-level `lprobust` and `lprobust_res` already support all four; expanding the public surface requires a golden generator for each hc mode and a decision on hc2/hc3 q-fit leverage (R reuses p-fit `hii` for q-fit residuals; whether to match that or stage-match deserves a derivation before the wrapper advertises CCT-2014 conformance). | `diff_diff/local_linear.py::bias_corrected_local_linear`, `benchmarks/R/generate_nprobust_lprobust_golden.R`, `tests/test_bias_corrected_lprobust.py` | Phase 1c | Medium | | `bias_corrected_local_linear`: support `weights=` once survey-design adaptation lands. nprobust's `lprobust` has no weight argument so there is no parity anchor; derivation needed. | `diff_diff/local_linear.py`, `diff_diff/_nprobust_port.py::lprobust` | Phase 1c | Medium | diff --git a/diff_diff/bootstrap_utils.py b/diff_diff/bootstrap_utils.py index d492b8a0..44d94546 100644 --- a/diff_diff/bootstrap_utils.py +++ b/diff_diff/bootstrap_utils.py @@ -24,9 +24,50 @@ "compute_effect_bootstrap_stats", "compute_effect_bootstrap_stats_batch", "warn_bootstrap_failure_rate", + "stratified_bootstrap_indices", ] +def stratified_bootstrap_indices( + rng: np.random.Generator, + n_control: int, + n_treated: int, + n_bootstrap: int, +) -> Tuple[np.ndarray, np.ndarray]: + """Generate stratified bootstrap sample indices. + + Draws controls first, then treated, per replicate. A single ``rng`` + advances through all replicates so Python and Rust consumers share an + identical bytestream under the same seed. + + Parameters + ---------- + rng : np.random.Generator + Seeded numpy generator. Consumed positionally; caller owns seeding. + n_control : int + Size of the control pool. Indices drawn in ``[0, n_control)``. + n_treated : int + Size of the treated pool. Indices drawn in ``[0, n_treated)``. + n_bootstrap : int + Number of bootstrap replicates. + + Returns + ------- + (control_indices, treated_indices) : tuple of np.ndarray + Shapes ``(n_bootstrap, n_control)`` and ``(n_bootstrap, n_treated)``, + dtype ``int64``. Each row is one replicate's sample-with-replacement + indices. Callers map these back to unit identities via their own pool. + """ + control_idx = np.empty((n_bootstrap, n_control), dtype=np.int64) + treated_idx = np.empty((n_bootstrap, n_treated), dtype=np.int64) + for b in range(n_bootstrap): + if n_control > 0: + control_idx[b] = rng.choice(n_control, size=n_control, replace=True) + if n_treated > 0: + treated_idx[b] = rng.choice(n_treated, size=n_treated, replace=True) + return control_idx, treated_idx + + def warn_bootstrap_failure_rate( n_success: int, n_attempted: int, diff --git a/diff_diff/trop_global.py b/diff_diff/trop_global.py index 3d651fd9..568e0eeb 100644 --- a/diff_diff/trop_global.py +++ b/diff_diff/trop_global.py @@ -24,7 +24,10 @@ _rust_bootstrap_trop_variance_global, _rust_loocv_grid_search_global, ) -from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate +from diff_diff.bootstrap_utils import ( + stratified_bootstrap_indices, + warn_bootstrap_failure_rate, +) from diff_diff.trop_local import _soft_threshold_svd, _validate_and_pivot_treatment from diff_diff.trop_results import TROPResults from diff_diff.utils import safe_inference, warn_if_not_converged @@ -961,6 +964,21 @@ def _bootstrap_variance_global( survey_design, ) + # Stratified bootstrap pools (shared by Rust and Python paths) + unit_ever_treated = data.groupby(unit)[treatment].max() + treated_units = np.array(unit_ever_treated[unit_ever_treated == 1].index.tolist()) + control_units = np.array(unit_ever_treated[unit_ever_treated == 0].index.tolist()) + n_treated_units = len(treated_units) + n_control_units = len(control_units) + + # Pre-generate stratified bootstrap indices via numpy (Python-canonical RNG). + # Both backends consume these indices so SE is identical under the same seed + # (silent-failures finding #23, bootstrap half). + rng = np.random.default_rng(self.seed) + control_idx, treated_idx = stratified_bootstrap_indices( + rng, n_control_units, n_treated_units, self.n_bootstrap + ) + # Try Rust backend for parallel bootstrap (5-15x speedup) # Only used for pweight-only designs (no strata/PSU/FPC) if HAS_RUST_BACKEND and _rust_bootstrap_trop_variance_global is not None: @@ -991,7 +1009,8 @@ def _bootstrap_variance_global( self.n_bootstrap, self.max_iter, self.tol, - self.seed if self.seed is not None else 0, + control_idx, + treated_idx, unit_weight_arr, ) @@ -1015,32 +1034,17 @@ def _bootstrap_variance_global( stacklevel=2, ) - # Python fallback implementation - rng = np.random.default_rng(self.seed) - - # Stratified bootstrap sampling - unit_ever_treated = data.groupby(unit)[treatment].max() - treated_units = np.array(unit_ever_treated[unit_ever_treated == 1].index.tolist()) - control_units = np.array(unit_ever_treated[unit_ever_treated == 0].index.tolist()) - - n_treated_units = len(treated_units) - n_control_units = len(control_units) - + # Python fallback: consume the same indices the Rust branch would have used. bootstrap_estimates_list: List[float] = [] nonconverg_tracker: List[int] = [] - for _ in range(self.n_bootstrap): - # Stratified sampling - if n_control_units > 0: - sampled_control = rng.choice(control_units, size=n_control_units, replace=True) - else: - sampled_control = np.array([], dtype=object) - - if n_treated_units > 0: - sampled_treated = rng.choice(treated_units, size=n_treated_units, replace=True) - else: - sampled_treated = np.array([], dtype=object) - + for b in range(self.n_bootstrap): + sampled_control = ( + control_units[control_idx[b]] if n_control_units > 0 else np.array([], dtype=object) + ) + sampled_treated = ( + treated_units[treated_idx[b]] if n_treated_units > 0 else np.array([], dtype=object) + ) sampled_units = np.concatenate([sampled_control, sampled_treated]) # Create bootstrap sample diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py index 1b6bcccf..683cbb9a 100644 --- a/diff_diff/trop_local.py +++ b/diff_diff/trop_local.py @@ -24,7 +24,10 @@ _rust_bootstrap_trop_variance, _rust_unit_distance_matrix, ) -from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate +from diff_diff.bootstrap_utils import ( + stratified_bootstrap_indices, + warn_bootstrap_failure_rate, +) from diff_diff.trop_results import _PrecomputedStructures from diff_diff.utils import warn_if_not_converged @@ -957,6 +960,23 @@ def _bootstrap_variance( survey_design, ) + # Stratified bootstrap pools (shared by Rust and Python paths). + # Paper's Algorithm 3 (page 27) specifies sampling N_0 control rows + # and N_1 treated rows separately to preserve treatment ratio. + unit_ever_treated = data.groupby(unit)[treatment].max() + treated_units = np.array(unit_ever_treated[unit_ever_treated == 1].index) + control_units = np.array(unit_ever_treated[unit_ever_treated == 0].index) + n_treated_units = len(treated_units) + n_control_units = len(control_units) + + # Pre-generate stratified bootstrap indices via numpy (Python-canonical RNG). + # Both backends consume these indices so SE is identical under the same seed + # (silent-failures finding #23, bootstrap half). + rng = np.random.default_rng(self.seed) + control_idx, treated_idx = stratified_bootstrap_indices( + rng, n_control_units, n_treated_units, self.n_bootstrap + ) + # Try Rust backend for parallel bootstrap (5-15x speedup) # Only used for pweight-only designs (no strata/PSU/FPC) if ( @@ -981,7 +1001,8 @@ def _bootstrap_variance( self.n_bootstrap, self.max_iter, self.tol, - self.seed if self.seed is not None else 0, + control_idx, + treated_idx, unit_weight_arr, ) @@ -1003,36 +1024,21 @@ def _bootstrap_variance( stacklevel=2, ) - # Python implementation (fallback) - rng = np.random.default_rng(self.seed) - - # Issue D fix: Stratified bootstrap sampling - # Paper's Algorithm 3 (page 27) specifies sampling N_0 control rows - # and N_1 treated rows separately to preserve treatment ratio - unit_ever_treated = data.groupby(unit)[treatment].max() - treated_units = np.array(unit_ever_treated[unit_ever_treated == 1].index) - control_units = np.array(unit_ever_treated[unit_ever_treated == 0].index) - - n_treated_units = len(treated_units) - n_control_units = len(control_units) - + # Python fallback: consume the same indices the Rust branch would have used. bootstrap_estimates_list = [] nonconverg_tracker: List[int] = [] - for _ in range(self.n_bootstrap): - # Stratified sampling: sample control and treated units separately - # This preserves the treatment ratio in each bootstrap sample - if n_control_units > 0: - sampled_control = rng.choice(control_units, size=n_control_units, replace=True) - else: - sampled_control = np.array([], dtype=control_units.dtype) - - if n_treated_units > 0: - sampled_treated = rng.choice(treated_units, size=n_treated_units, replace=True) - else: - sampled_treated = np.array([], dtype=treated_units.dtype) - - # Combine stratified samples + for b in range(self.n_bootstrap): + sampled_control = ( + control_units[control_idx[b]] + if n_control_units > 0 + else np.array([], dtype=control_units.dtype) + ) + sampled_treated = ( + treated_units[treated_idx[b]] + if n_treated_units > 0 + else np.array([], dtype=treated_units.dtype) + ) sampled_units = np.concatenate([sampled_control, sampled_treated]) # Create bootstrap sample with unique unit IDs diff --git a/rust/src/trop.rs b/rust/src/trop.rs index b1f52e5c..7041a840 100644 --- a/rust/src/trop.rs +++ b/rust/src/trop.rs @@ -924,7 +924,7 @@ fn max_abs_diff_2d(a: &Array2, b: &Array2) -> f64 { /// # Returns /// (bootstrap_estimates, standard_error) #[pyfunction] -#[pyo3(signature = (y, d, control_mask, time_dist_matrix, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, seed, survey_weights=None))] +#[pyo3(signature = (y, d, control_mask, time_dist_matrix, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, control_indices, treated_indices, survey_weights=None))] #[allow(clippy::too_many_arguments)] pub fn bootstrap_trop_variance<'py>( py: Python<'py>, @@ -938,13 +938,16 @@ pub fn bootstrap_trop_variance<'py>( n_bootstrap: usize, max_iter: usize, tol: f64, - seed: u64, + control_indices: PyReadonlyArray2<'py, i64>, + treated_indices: PyReadonlyArray2<'py, i64>, survey_weights: Option>, ) -> PyResult<(Bound<'py, PyArray1>, f64)> { let y_arr = y.as_array().to_owned(); let d_arr = d.as_array().to_owned(); let control_mask_arr = control_mask.as_array().to_owned(); let time_dist_arr = time_dist_matrix.as_array().to_owned(); + let ctrl_idx_arr = control_indices.as_array().to_owned(); + let trt_idx_arr = treated_indices.as_array().to_owned(); let sw_arr: Option> = survey_weights.map(|sw| sw.as_array().to_owned()); let n_units = y_arr.ncols(); @@ -965,27 +968,40 @@ pub fn bootstrap_trop_variance<'py>( let n_treated_units = original_treated_units.len(); let n_control_units = original_control_units.len(); + // Validate index-array shapes match the stratified pool sizes + if ctrl_idx_arr.shape() != [n_bootstrap, n_control_units] { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "control_indices shape {:?} does not match (n_bootstrap={}, n_control_units={})", + ctrl_idx_arr.shape(), + n_bootstrap, + n_control_units, + ))); + } + if trt_idx_arr.shape() != [n_bootstrap, n_treated_units] { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "treated_indices shape {:?} does not match (n_bootstrap={}, n_treated_units={})", + trt_idx_arr.shape(), + n_bootstrap, + n_treated_units, + ))); + } + // Run bootstrap iterations in parallel + // RNG-canonical contract: control_indices and treated_indices are pre-generated + // by numpy.random.default_rng(seed) on the Python side via + // diff_diff.bootstrap_utils.stratified_bootstrap_indices, so SE is identical + // across backends under the same seed (silent-failures finding #23). let bootstrap_estimates: Vec = (0..n_bootstrap) .into_par_iter() .filter_map(|b| { - use rand::prelude::*; - use rand_xoshiro::Xoshiro256PlusPlus; - - let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(b as u64)); - - // Issue D fix: Stratified sampling - sample control and treated units separately + // Stratified sampling: consume pre-generated indices for replicate b let mut sampled_units: Vec = Vec::with_capacity(n_units); - - // Sample control units with replacement - for _ in 0..n_control_units { - let idx = rng.gen_range(0..n_control_units); + for j in 0..n_control_units { + let idx = ctrl_idx_arr[[b, j]] as usize; sampled_units.push(original_control_units[idx]); } - - // Sample treated units with replacement - for _ in 0..n_treated_units { - let idx = rng.gen_range(0..n_treated_units); + for j in 0..n_treated_units { + let idx = trt_idx_arr[[b, j]] as usize; sampled_units.push(original_treated_units[idx]); } @@ -1733,7 +1749,7 @@ pub fn loocv_grid_search_global<'py>( /// # Returns /// (bootstrap_estimates, standard_error) #[pyfunction] -#[pyo3(signature = (y, d, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, seed, survey_weights=None))] +#[pyo3(signature = (y, d, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, control_indices, treated_indices, survey_weights=None))] #[allow(clippy::too_many_arguments)] pub fn bootstrap_trop_variance_global<'py>( py: Python<'py>, @@ -1745,11 +1761,14 @@ pub fn bootstrap_trop_variance_global<'py>( n_bootstrap: usize, max_iter: usize, tol: f64, - seed: u64, + control_indices: PyReadonlyArray2<'py, i64>, + treated_indices: PyReadonlyArray2<'py, i64>, survey_weights: Option>, ) -> PyResult<(Bound<'py, PyArray1>, f64)> { let y_arr = y.as_array().to_owned(); let d_arr = d.as_array().to_owned(); + let ctrl_idx_arr = control_indices.as_array().to_owned(); + let trt_idx_arr = treated_indices.as_array().to_owned(); let sw_arr: Option> = survey_weights.map(|sw| sw.as_array().to_owned()); let n_units = y_arr.ncols(); @@ -1769,6 +1788,24 @@ pub fn bootstrap_trop_variance_global<'py>( let n_treated_units = original_treated_units.len(); let n_control_units = original_control_units.len(); + // Validate index-array shapes match the stratified pool sizes + if ctrl_idx_arr.shape() != [n_bootstrap, n_control_units] { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "control_indices shape {:?} does not match (n_bootstrap={}, n_control_units={})", + ctrl_idx_arr.shape(), + n_bootstrap, + n_control_units, + ))); + } + if trt_idx_arr.shape() != [n_bootstrap, n_treated_units] { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "treated_indices shape {:?} does not match (n_bootstrap={}, n_treated_units={})", + trt_idx_arr.shape(), + n_bootstrap, + n_treated_units, + ))); + } + // Determine treated periods from D matrix let mut first_treat_period = n_periods; for t in 0..n_periods { @@ -1785,31 +1822,22 @@ pub fn bootstrap_trop_variance_global<'py>( let ln_eff = if lambda_nn.is_infinite() { 1e10 } else { lambda_nn }; // Run bootstrap iterations in parallel + // RNG-canonical contract: control_indices and treated_indices are pre-generated + // by numpy.random.default_rng(seed) on the Python side via + // diff_diff.bootstrap_utils.stratified_bootstrap_indices, so SE is identical + // across backends under the same seed (silent-failures finding #23). let bootstrap_estimates: Vec = (0..n_bootstrap) .into_par_iter() .filter_map(|b| { - use rand::prelude::*; - use rand_xoshiro::Xoshiro256PlusPlus; - - let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(b as u64)); - - // Stratified sampling - sample control and treated units separately + // Stratified sampling: consume pre-generated indices for replicate b let mut sampled_units: Vec = Vec::with_capacity(n_units); - - // Sample control units with replacement - for _ in 0..n_control_units { - if n_control_units > 0 { - let idx = rng.gen_range(0..n_control_units); - sampled_units.push(original_control_units[idx]); - } + for j in 0..n_control_units { + let idx = ctrl_idx_arr[[b, j]] as usize; + sampled_units.push(original_control_units[idx]); } - - // Sample treated units with replacement - for _ in 0..n_treated_units { - if n_treated_units > 0 { - let idx = rng.gen_range(0..n_treated_units); - sampled_units.push(original_treated_units[idx]); - } + for j in 0..n_treated_units { + let idx = trt_idx_arr[[b, j]] as usize; + sampled_units.push(original_treated_units[idx]); } // Create bootstrap matrices by selecting columns diff --git a/tests/test_bootstrap_utils.py b/tests/test_bootstrap_utils.py index 9746bc3e..d4cb7ae1 100644 --- a/tests/test_bootstrap_utils.py +++ b/tests/test_bootstrap_utils.py @@ -8,6 +8,7 @@ from diff_diff.bootstrap_utils import ( compute_effect_bootstrap_stats, compute_effect_bootstrap_stats_batch, + stratified_bootstrap_indices, warn_bootstrap_failure_rate, ) @@ -193,3 +194,76 @@ def test_all_failed_warns(self): """0/N replicates succeeded — caller handles NaN return, but the warning fires.""" with pytest.warns(UserWarning, match=r"0/50 bootstrap iterations"): warn_bootstrap_failure_rate(n_success=0, n_attempted=50, context="test case") + + +class TestStratifiedBootstrapIndices: + """Shared stratified-bootstrap index helper used by TROP Rust + Python paths. + + Pinning these invariants matters because both TROP backends now consume + the helper's output directly; any drift in shape, dtype, or draw order + would silently break backend parity (silent-failures audit finding #23). + """ + + def test_shapes_and_dtype(self): + rng = np.random.default_rng(0) + ctrl, trt = stratified_bootstrap_indices(rng, n_control=5, n_treated=3, n_bootstrap=7) + assert ctrl.shape == (7, 5) + assert trt.shape == (7, 3) + assert ctrl.dtype == np.int64 + assert trt.dtype == np.int64 + + def test_value_range(self): + rng = np.random.default_rng(123) + ctrl, trt = stratified_bootstrap_indices(rng, n_control=4, n_treated=6, n_bootstrap=50) + assert ctrl.min() >= 0 and ctrl.max() < 4 + assert trt.min() >= 0 and trt.max() < 6 + + def test_determinism(self): + ctrl_a, trt_a = stratified_bootstrap_indices(np.random.default_rng(42), 3, 2, 5) + ctrl_b, trt_b = stratified_bootstrap_indices(np.random.default_rng(42), 3, 2, 5) + np.testing.assert_array_equal(ctrl_a, ctrl_b) + np.testing.assert_array_equal(trt_a, trt_b) + + def test_prefix_invariance(self): + """n_bootstrap=N prefix must match first N rows of n_bootstrap=M>N. + + Pins the sequential-per-replicate consumption law: one rng advances + through all replicates in order, so extending the loop only appends. + """ + ctrl_short, trt_short = stratified_bootstrap_indices(np.random.default_rng(7), 4, 3, 10) + ctrl_long, trt_long = stratified_bootstrap_indices(np.random.default_rng(7), 4, 3, 100) + np.testing.assert_array_equal(ctrl_short, ctrl_long[:10]) + np.testing.assert_array_equal(trt_short, trt_long[:10]) + + def test_value_pin_default_rng_42(self): + """Hard-coded byte-level pin. Catches silent draw-order drift. + + Any refactor that reorders the draws (e.g. treated-then-control, + vectorized single call, or a new rng primitive) will break this. + """ + rng = np.random.default_rng(42) + ctrl, trt = stratified_bootstrap_indices(rng, n_control=3, n_treated=2, n_bootstrap=5) + expected_ctrl = np.array( + [[0, 2, 1], [2, 0, 2], [1, 2, 2], [2, 1, 0], [1, 1, 0]], + dtype=np.int64, + ) + expected_trt = np.array( + [[0, 0], [0, 0], [1, 1], [1, 0], [1, 1]], + dtype=np.int64, + ) + np.testing.assert_array_equal(ctrl, expected_ctrl) + np.testing.assert_array_equal(trt, expected_trt) + + def test_empty_control_pool(self): + rng = np.random.default_rng(1) + ctrl, trt = stratified_bootstrap_indices(rng, n_control=0, n_treated=3, n_bootstrap=4) + assert ctrl.shape == (4, 0) + assert trt.shape == (4, 3) + assert trt.min() >= 0 and trt.max() < 3 + + def test_empty_treated_pool(self): + rng = np.random.default_rng(1) + ctrl, trt = stratified_bootstrap_indices(rng, n_control=3, n_treated=0, n_bootstrap=4) + assert ctrl.shape == (4, 3) + assert trt.shape == (4, 0) + assert ctrl.min() >= 0 and ctrl.max() < 3 diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index 228565d6..0433e78b 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -951,6 +951,14 @@ def test_loocv_grid_search_returns_valid_params(self): # Check first_failed is None or a valid (unit, time) tuple assert first_failed is None or (isinstance(first_failed, tuple) and len(first_failed) == 2) + @staticmethod + def _stratified_indices(n_control, n_treated, n_bootstrap, seed): + """Build stratified bootstrap index arrays via the shared helper.""" + from diff_diff.bootstrap_utils import stratified_bootstrap_indices + + rng = np.random.default_rng(seed) + return stratified_bootstrap_indices(rng, n_control, n_treated, n_bootstrap) + def test_bootstrap_variance_shape(self): """Test bootstrap returns correct shapes.""" from diff_diff._rust_backend import bootstrap_trop_variance @@ -969,10 +977,14 @@ def test_bootstrap_variance_shape(self): ).astype(np.int64) n_bootstrap = 20 + # Stratified pools: 1 treated unit (index 0), 5 control units + ctrl_idx, trt_idx = self._stratified_indices( + n_control=5, n_treated=1, n_bootstrap=n_bootstrap, seed=42 + ) estimates, se = bootstrap_trop_variance( Y, D, control_mask, time_dist, 1.0, 1.0, 0.1, # lambda values - n_bootstrap, 100, 1e-6, 42 + n_bootstrap, 100, 1e-6, ctrl_idx, trt_idx, ) # Should return array of bootstrap estimates and SE @@ -996,14 +1008,20 @@ def test_bootstrap_reproducibility(self): np.arange(n_periods)[:, np.newaxis] - np.arange(n_periods)[np.newaxis, :] ).astype(np.int64) - # Run twice with same seed + # Run twice with same seed (helper is deterministic given the same seed) + ctrl_idx_a, trt_idx_a = self._stratified_indices( + n_control=5, n_treated=1, n_bootstrap=20, seed=42 + ) + ctrl_idx_b, trt_idx_b = self._stratified_indices( + n_control=5, n_treated=1, n_bootstrap=20, seed=42 + ) est1, se1 = bootstrap_trop_variance( Y, D, control_mask, time_dist, - 1.0, 1.0, 0.1, 20, 100, 1e-6, 42 + 1.0, 1.0, 0.1, 20, 100, 1e-6, ctrl_idx_a, trt_idx_a, ) est2, se2 = bootstrap_trop_variance( Y, D, control_mask, time_dist, - 1.0, 1.0, 0.1, 20, 100, 1e-6, 42 + 1.0, 1.0, 0.1, 20, 100, 1e-6, ctrl_idx_b, trt_idx_b, ) np.testing.assert_array_almost_equal(est1, est2) @@ -1171,6 +1189,14 @@ def test_loocv_grid_search_global_reproducible(self): # Without subsampling, results should be deterministic assert result1[:4] == result2[:4] + @staticmethod + def _global_stratified_indices(n_control, n_treated, n_bootstrap, seed): + """Build stratified bootstrap index arrays via the shared helper.""" + from diff_diff.bootstrap_utils import stratified_bootstrap_indices + + rng = np.random.default_rng(seed) + return stratified_bootstrap_indices(rng, n_control, n_treated, n_bootstrap) + def test_bootstrap_trop_variance_global_shape(self): """Test bootstrap_trop_variance_global returns valid output.""" from diff_diff._rust_backend import bootstrap_trop_variance_global @@ -1184,8 +1210,12 @@ def test_bootstrap_trop_variance_global_shape(self): D = np.zeros((n_periods, n_units)) D[-n_post:, :n_treated] = 1.0 + # Stratified pools: 4 treated units, 11 control units + ctrl_idx, trt_idx = self._global_stratified_indices( + n_control=n_units - n_treated, n_treated=n_treated, n_bootstrap=50, seed=42 + ) estimates, se = bootstrap_trop_variance_global( - Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, 42 + Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, ctrl_idx, trt_idx, ) assert isinstance(estimates, np.ndarray) @@ -1206,11 +1236,17 @@ def test_bootstrap_trop_variance_global_reproducible(self): D = np.zeros((n_periods, n_units)) D[-n_post:, :n_treated] = 1.0 + ctrl_a, trt_a = self._global_stratified_indices( + n_control=n_units - n_treated, n_treated=n_treated, n_bootstrap=50, seed=42 + ) + ctrl_b, trt_b = self._global_stratified_indices( + n_control=n_units - n_treated, n_treated=n_treated, n_bootstrap=50, seed=42 + ) est1, se1 = bootstrap_trop_variance_global( - Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, 42 + Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, ctrl_a, trt_a, ) est2, se2 = bootstrap_trop_variance_global( - Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, 42 + Y, D, 0.5, 0.5, 0.1, 50, 50, 1e-6, ctrl_b, trt_b, ) np.testing.assert_array_almost_equal(est1, est2) @@ -2253,22 +2289,17 @@ def test_grid_search_rank_deficient_Y(self): f"Rust={res_rust.att:.8f}, Python={res_py.att:.8f}", ) - @pytest.mark.xfail( - strict=True, - reason="Rust bootstrap uses its own RNG (rand crate) while Python uses " - "numpy's default_rng; same `seed=42` produces different bootstrap " - "replicate indices. Empirical SE divergence ~28%. Unifying the RNG " - "across backends is a P1 follow-up (TODO.md). This xfail baselines " - "the gap so we notice if/when the backends share an RNG seed-to-" - "bytestream mapping. Grid-search ATT parity is validated in the " - "previous test (rank-deficient Y path).", - ) - def test_bootstrap_seed_reproducibility(self): - """Bootstrap SE parity under a fixed seed. - - Known to fail: Rust and Python use different RNG backends, so - identical ``seed=42`` produces different bootstrap replicates. - See xfail reason for follow-up plan. + @pytest.mark.parametrize("seed", [0, 42, 12345]) + def test_bootstrap_seed_reproducibility(self, seed): + """Bootstrap SE parity under a fixed seed (global method). + + Silent-failures audit Finding #23 (bootstrap half) regression guard. + Previously a ~28% SE divergence on tiny panels because Rust seeded + ``rand_xoshiro::Xoshiro256PlusPlus`` per replicate while Python + consumed ``numpy.random.default_rng`` (PCG64). Fixed by pre-generating + stratified bootstrap indices via numpy on the Python side and passing + them to Rust through the PyO3 surface (Python-canonical RNG); both + backends now consume bit-identical index streams under the same seed. """ import sys from unittest.mock import patch @@ -2282,7 +2313,7 @@ def test_bootstrap_seed_reproducibility(self): lambda_unit_grid=[1.0], lambda_nn_grid=[np.inf], n_bootstrap=10, - seed=42, + seed=seed, ) trop_rust = TROP(**trop_params) @@ -2296,9 +2327,62 @@ def test_bootstrap_seed_reproducibility(self): res_py = trop_py.fit(df.copy(), "outcome", "treated", "unit", "time") np.testing.assert_allclose( - res_rust.se, res_py.se, atol=1e-8, - err_msg=f"Bootstrap SE divergence under seed=42: " - f"Rust={res_rust.se:.10f}, Python={res_py.se:.10f}", + res_rust.se, res_py.se, atol=1e-14, rtol=1e-14, + err_msg=f"Bootstrap SE divergence under seed={seed}: " + f"Rust={res_rust.se:.16f}, Python={res_py.se:.16f}", + ) + + @pytest.mark.xfail( + strict=True, + reason="Local-method TROP has additional Rust-vs-Python backend " + "divergences beyond RNG (weight-matrix normalization in " + "`compute_weight_matrix` is Rust-only; Python " + "`_compute_observation_weights` reads `self._precomputed['Y']` " + "instead of the bootstrap-sample `Y`). RNG parity infrastructure " + "is in place (both backends now consume the same Python-canonical " + "stratified indices), but post-resampling computations diverge so " + "SE is not yet bit-identical. Tracked as a separate finding in " + "TODO.md. This xfail baselines the gap so we notice when both " + "downstream divergences are closed.", + ) + @pytest.mark.parametrize("seed", [0, 42, 12345]) + def test_bootstrap_seed_reproducibility_local(self, seed): + """Bootstrap SE parity under a fixed seed (local method). + + Currently xfail(strict=True) — see decorator. The local-method + path has additional backend divergences beyond RNG that this PR + does not address. Companion to the global-method parity test + above; once the downstream divergences close, this test will + XPASS and should be promoted to a regression guard. + """ + import sys + from unittest.mock import patch + + from diff_diff import TROP + + df = self._make_correlated_panel(n_units=6, n_periods=6, n_treated=2) + trop_params = dict( + method="local", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=10, + seed=seed, + ) + + trop_rust = TROP(**trop_params) + res_rust = trop_rust.fit(df.copy(), "outcome", "treated", "unit", "time") + + trop_local_module = sys.modules["diff_diff.trop_local"] + with patch.object(trop_local_module, "HAS_RUST_BACKEND", False), \ + patch.object(trop_local_module, "_rust_bootstrap_trop_variance", None): + trop_py = TROP(**trop_params) + res_py = trop_py.fit(df.copy(), "outcome", "treated", "unit", "time") + + np.testing.assert_allclose( + res_rust.se, res_py.se, atol=1e-14, rtol=1e-14, + err_msg=f"Local-method bootstrap SE divergence under seed={seed}: " + f"Rust={res_rust.se:.16f}, Python={res_py.se:.16f}", ) From 5ce5b8f05cac9b4be4fc8a0d7ad3cb428d599750 Mon Sep 17 00:00:00 2001 From: igerber Date: Thu, 23 Apr 2026 21:02:10 -0400 Subject: [PATCH 2/2] Address PR #354 AI review: validate Rust index inputs + doc fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 — Rust `bootstrap_trop_variance` and `bootstrap_trop_variance_global` now validate every element of `control_indices` and `treated_indices` before entering the parallel loop. Negative values and values `>= pool size` raise `PyValueError` with the offending index, rather than panicking on a negative-to-`usize` cast or indexing `original_*_units` out of bounds inside the rayon closure. P3 — `rust/src/trop.rs` function header docs rewritten for both bootstrap entry points. Removed the stale `seed` argument; added description of `control_indices` and `treated_indices` shape/dtype/ range, and documented the Python-canonical RNG contract via `diff_diff.bootstrap_utils.stratified_bootstrap_indices`. P3 — `diff_diff/trop_local.py` comment on the pre-generate block rewritten to state that local-method SE is NOT yet bit-identical across backends (only the RNG layer is aligned; two downstream divergences remain in `compute_weight_matrix` normalization and the Python `_compute_observation_weights` stale-cache path; tracked in `TODO.md`). P3 — New `tests/test_rust_backend.py` coverage: `TestTROPRustBackend::test_bootstrap_rejects_negative_index`, `test_bootstrap_rejects_out_of_range_index`, and the corresponding `TestTROPGlobalRustBackend` pair each assert `ValueError` with the expected regex for malformed inputs on both local and global bootstrap functions. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/trop_local.py | 9 +++- rust/src/trop.rs | 85 +++++++++++++++++++++++++++++++++--- tests/test_rust_backend.py | 88 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 8 deletions(-) diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py index 683cbb9a..39cce9fc 100644 --- a/diff_diff/trop_local.py +++ b/diff_diff/trop_local.py @@ -970,8 +970,13 @@ def _bootstrap_variance( n_control_units = len(control_units) # Pre-generate stratified bootstrap indices via numpy (Python-canonical RNG). - # Both backends consume these indices so SE is identical under the same seed - # (silent-failures finding #23, bootstrap half). + # This aligns the RNG layer between backends. For the global method the RNG + # was the only divergence, so global SE is now bit-identical under the same + # seed. For the local method two downstream divergences remain (Rust weight- + # matrix normalization + Python `_compute_observation_weights` reading the + # stale `_precomputed` cache) — tracked in TODO.md; until those land, local + # bootstrap SE still differs across backends. Silent-failures finding #23 + # (bootstrap half) closed for global; local follow-up queued. rng = np.random.default_rng(self.seed) control_idx, treated_idx = stratified_bootstrap_indices( rng, n_control_units, n_treated_units, self.n_bootstrap diff --git a/rust/src/trop.rs b/rust/src/trop.rs index 7041a840..ac98b44f 100644 --- a/rust/src/trop.rs +++ b/rust/src/trop.rs @@ -897,7 +897,7 @@ fn max_abs_diff_2d(a: &Array2, b: &Array2) -> f64 { .fold(0.0_f64, f64::max) } -/// Compute bootstrap variance estimation for TROP in parallel. +/// Compute bootstrap variance estimation for TROP in parallel (local method). /// /// Performs unit-level block bootstrap, parallelizing across bootstrap iterations. /// @@ -905,9 +905,6 @@ fn max_abs_diff_2d(a: &Array2, b: &Array2) -> f64 { /// * `y` - Outcome matrix (n_periods x n_units) /// * `d` - Treatment indicator matrix (n_periods x n_units) /// * `control_mask` - Boolean mask for control observations -/// * `control_unit_idx` - Array of control unit indices -/// * `treated_obs` - List of (t, i) treated observations -/// * `unit_dist_matrix` - Pre-computed unit distance matrix /// * `time_dist_matrix` - Pre-computed time distance matrix /// * `lambda_time` - Selected time decay parameter /// * `lambda_unit` - Selected unit distance parameter @@ -915,12 +912,24 @@ fn max_abs_diff_2d(a: &Array2, b: &Array2) -> f64 { /// * `n_bootstrap` - Number of bootstrap iterations /// * `max_iter` - Maximum iterations for model estimation /// * `tol` - Convergence tolerance -/// * `seed` - Random seed +/// * `control_indices` - Pre-generated stratified bootstrap indices for the +/// control pool, shape `(n_bootstrap, n_control_units)`, dtype `i64`. +/// Values must be in `[0, n_control_units)`. +/// * `treated_indices` - Pre-generated stratified bootstrap indices for the +/// treated pool, shape `(n_bootstrap, n_treated_units)`, dtype `i64`. +/// Values must be in `[0, n_treated_units)`. /// * `survey_weights` - Optional unit-level survey weights (length n_units). /// When provided, ATT is computed as a weighted mean of per-observation /// treatment effects using unit weights. Model fitting, LOOCV, and distance /// computation are unchanged. /// +/// The index arrays carry the RNG contract: they are produced on the Python +/// side by `diff_diff.bootstrap_utils.stratified_bootstrap_indices` with a +/// numpy `default_rng(seed)`, so Rust and Python consumers see identical +/// sampling under the same seed. Invalid index values (negative or out of +/// range) raise a `PyValueError` rather than silently producing malformed +/// bootstrap samples. +/// /// # Returns /// (bootstrap_estimates, standard_error) #[pyfunction] @@ -986,6 +995,32 @@ pub fn bootstrap_trop_variance<'py>( ))); } + // Validate index values are in range. Fail fast with a clean PyValueError + // rather than panicking inside the parallel loop on a negative cast or an + // out-of-pool Vec index. + if n_control_units > 0 { + let n_ctrl = n_control_units as i64; + for v in ctrl_idx_arr.iter() { + if *v < 0 || *v >= n_ctrl { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "control_indices contains out-of-range value {} (valid: [0, {}))", + v, n_control_units, + ))); + } + } + } + if n_treated_units > 0 { + let n_trt = n_treated_units as i64; + for v in trt_idx_arr.iter() { + if *v < 0 || *v >= n_trt { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "treated_indices contains out-of-range value {} (valid: [0, {}))", + v, n_treated_units, + ))); + } + } + } + // Run bootstrap iterations in parallel // RNG-canonical contract: control_indices and treated_indices are pre-generated // by numpy.random.default_rng(seed) on the Python side via @@ -1740,12 +1775,24 @@ pub fn loocv_grid_search_global<'py>( /// * `n_bootstrap` - Number of bootstrap iterations /// * `max_iter` - Maximum iterations for model estimation /// * `tol` - Convergence tolerance -/// * `seed` - Random seed +/// * `control_indices` - Pre-generated stratified bootstrap indices for the +/// control pool, shape `(n_bootstrap, n_control_units)`, dtype `i64`. +/// Values must be in `[0, n_control_units)`. +/// * `treated_indices` - Pre-generated stratified bootstrap indices for the +/// treated pool, shape `(n_bootstrap, n_treated_units)`, dtype `i64`. +/// Values must be in `[0, n_treated_units)`. /// * `survey_weights` - Optional unit-level survey weights (length n_units). /// When provided, ATT is computed as a weighted mean of per-observation /// treatment effects using unit weights. Model fitting, LOOCV, and distance /// computation are unchanged. /// +/// The index arrays carry the RNG contract: they are produced on the Python +/// side by `diff_diff.bootstrap_utils.stratified_bootstrap_indices` with a +/// numpy `default_rng(seed)`, so Rust and Python consumers see identical +/// sampling under the same seed. Invalid index values (negative or out of +/// range) raise a `PyValueError` rather than silently producing malformed +/// bootstrap samples. +/// /// # Returns /// (bootstrap_estimates, standard_error) #[pyfunction] @@ -1806,6 +1853,32 @@ pub fn bootstrap_trop_variance_global<'py>( ))); } + // Validate index values are in range. Fail fast with a clean PyValueError + // rather than panicking inside the parallel loop on a negative cast or an + // out-of-pool Vec index. + if n_control_units > 0 { + let n_ctrl = n_control_units as i64; + for v in ctrl_idx_arr.iter() { + if *v < 0 || *v >= n_ctrl { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "control_indices contains out-of-range value {} (valid: [0, {}))", + v, n_control_units, + ))); + } + } + } + if n_treated_units > 0 { + let n_trt = n_treated_units as i64; + for v in trt_idx_arr.iter() { + if *v < 0 || *v >= n_trt { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "treated_indices contains out-of-range value {} (valid: [0, {}))", + v, n_treated_units, + ))); + } + } + } + // Determine treated periods from D matrix let mut first_treat_period = n_periods; for t in 0..n_periods { diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index 0433e78b..1ecc8af8 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -1027,6 +1027,54 @@ def test_bootstrap_reproducibility(self): np.testing.assert_array_almost_equal(est1, est2) assert abs(se1 - se2) < 1e-10 + def test_bootstrap_rejects_negative_index(self): + """Rust local bootstrap must raise PyValueError on a negative index.""" + from diff_diff._rust_backend import bootstrap_trop_variance + + np.random.seed(42) + n_periods, n_units = 8, 6 + Y = np.random.randn(n_periods, n_units) + D = np.zeros((n_periods, n_units)) + D[6:, 0] = 1.0 + control_mask = (D == 0).astype(np.uint8) + time_dist = np.abs( + np.arange(n_periods)[:, np.newaxis] - np.arange(n_periods)[np.newaxis, :] + ).astype(np.int64) + + ctrl_idx, trt_idx = self._stratified_indices( + n_control=5, n_treated=1, n_bootstrap=5, seed=0 + ) + ctrl_idx[2, 3] = -1 # negative + with pytest.raises(ValueError, match="control_indices.*out-of-range"): + bootstrap_trop_variance( + Y, D, control_mask, time_dist, + 1.0, 1.0, 0.1, 5, 100, 1e-6, ctrl_idx, trt_idx, + ) + + def test_bootstrap_rejects_out_of_range_index(self): + """Rust local bootstrap must raise PyValueError on an index >= pool size.""" + from diff_diff._rust_backend import bootstrap_trop_variance + + np.random.seed(42) + n_periods, n_units = 8, 6 + Y = np.random.randn(n_periods, n_units) + D = np.zeros((n_periods, n_units)) + D[6:, 0] = 1.0 + control_mask = (D == 0).astype(np.uint8) + time_dist = np.abs( + np.arange(n_periods)[:, np.newaxis] - np.arange(n_periods)[np.newaxis, :] + ).astype(np.int64) + + ctrl_idx, trt_idx = self._stratified_indices( + n_control=5, n_treated=1, n_bootstrap=5, seed=0 + ) + trt_idx[1, 0] = 99 # >> n_treated=1 + with pytest.raises(ValueError, match="treated_indices.*out-of-range"): + bootstrap_trop_variance( + Y, D, control_mask, time_dist, + 1.0, 1.0, 0.1, 5, 100, 1e-6, ctrl_idx, trt_idx, + ) + @pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") class TestTROPRustVsNumpy: @@ -1252,6 +1300,46 @@ def test_bootstrap_trop_variance_global_reproducible(self): np.testing.assert_array_almost_equal(est1, est2) np.testing.assert_almost_equal(se1, se2) + def test_bootstrap_global_rejects_negative_index(self): + """Rust global bootstrap must raise PyValueError on a negative index.""" + from diff_diff._rust_backend import bootstrap_trop_variance_global + + np.random.seed(42) + n_periods, n_units = 8, 15 + n_treated = 4 + Y = np.random.randn(n_periods, n_units) + D = np.zeros((n_periods, n_units)) + D[-2:, :n_treated] = 1.0 + + ctrl_idx, trt_idx = self._global_stratified_indices( + n_control=n_units - n_treated, n_treated=n_treated, n_bootstrap=10, seed=0 + ) + ctrl_idx[3, 2] = -5 # negative + with pytest.raises(ValueError, match="control_indices.*out-of-range"): + bootstrap_trop_variance_global( + Y, D, 0.5, 0.5, 0.1, 10, 50, 1e-6, ctrl_idx, trt_idx, + ) + + def test_bootstrap_global_rejects_out_of_range_index(self): + """Rust global bootstrap must raise PyValueError on an index >= pool size.""" + from diff_diff._rust_backend import bootstrap_trop_variance_global + + np.random.seed(42) + n_periods, n_units = 8, 15 + n_treated = 4 + Y = np.random.randn(n_periods, n_units) + D = np.zeros((n_periods, n_units)) + D[-2:, :n_treated] = 1.0 + + ctrl_idx, trt_idx = self._global_stratified_indices( + n_control=n_units - n_treated, n_treated=n_treated, n_bootstrap=10, seed=0 + ) + trt_idx[5, 1] = 99 # >> n_treated=4 + with pytest.raises(ValueError, match="treated_indices.*out-of-range"): + bootstrap_trop_variance_global( + Y, D, 0.5, 0.5, 0.1, 10, 50, 1e-6, ctrl_idx, trt_idx, + ) + @pytest.mark.slow @pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available")