diff --git a/TODO.md b/TODO.md index 8cc3475c..344ddd00 100644 --- a/TODO.md +++ b/TODO.md @@ -83,7 +83,7 @@ Deferred items from PR reviews that were not addressed before merge. | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium | | Regenerate `benchmarks/data/clubsandwich_cr2_golden.json` from R (`Rscript benchmarks/R/generate_clubsandwich_golden.R`). Current JSON has `source: python_self_reference` as a stability anchor until an authoritative R run. | `benchmarks/R/generate_clubsandwich_golden.R` | Phase 1a | Medium | | `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low | -| `compute_synthetic_weights` backend algorithm mismatch: Rust path uses Frank-Wolfe (`_rust_synthetic_weights` in `utils.py:1184`); Python fallback uses projected gradient descent (`_compute_synthetic_weights_numpy` in `utils.py:1228`). Both solve the same constrained QP but converge to different simplex vertices on near-degenerate / extreme-scale inputs (e.g. `Y~1e9`, or near-singular `Y'Y`). Unified backend (one algorithm) would close the parity gap surfaced by audit finding #22. Two `@pytest.mark.xfail(strict=True)` tests in `tests/test_rust_backend.py::TestSyntheticWeightsBackendParity` baseline the divergence so we notice when/if the algorithms align. | `utils.py`, `rust/` | follow-up | Medium | +| Rust `compute_synthetic_weights` + `compute_synthetic_weights_internal` (now dead code) can be removed from `rust/src/weights.rs:43-117` in a future Rust-cleanup PR. Python-side wrapper was deleted (post-audit cleanup for finding #22) and its sole caller now inlines Frank-Wolfe via `_sc_weight_fw`. The Rust symbol remains callable via `from diff_diff._rust_backend import compute_synthetic_weights` but no Python code calls it. Removal requires `maturin develop` rebuild. No functional impact of leaving it. | `rust/src/weights.rs` | follow-up | Low | | TROP Rust vs Python grid-search divergence on rank-deficient Y: on two near-parallel control units, LOOCV grid-search ATT diverges ~6% between Rust (`trop_global.py:688`) and Python fallback (`trop_global.py:753`). Either grid-winner ties are broken differently or the per-λ solver reaches different stationary points under rank deficiency. Audit finding #23 flagged this surface. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_grid_search_rank_deficient_Y` baselines the gap. | `trop_global.py`, `rust/` | follow-up | Medium | | TROP Rust vs Python bootstrap SE divergence under fixed seed: `seed=42` on a tiny panel produces ~28% bootstrap-SE gap. Root cause: Rust bootstrap uses its own RNG (`rand` crate) while Python uses `numpy.random.default_rng`; same seed value maps to different bytestreams across backends. Audit axis-H (RNG/seed) adjacent. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility` baselines the gap. Unifying RNG (threading a numpy-generated seed-sequence into Rust, or porting Python to ChaCha) would close it. | `trop_global.py`, `rust/` | follow-up | Medium | diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index 3241ec92..6c41d39d 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -21,7 +21,6 @@ _rust_compute_robust_vcov, _rust_project_simplex, _rust_solve_ols, - _rust_synthetic_weights, ) from diff_diff.bacon import ( diff --git a/diff_diff/_backend.py b/diff_diff/_backend.py index b3931b72..cfcffe7f 100644 --- a/diff_diff/_backend.py +++ b/diff_diff/_backend.py @@ -19,7 +19,6 @@ try: from diff_diff._rust_backend import ( generate_bootstrap_weights_batch as _rust_bootstrap_weights, - compute_synthetic_weights as _rust_synthetic_weights, project_simplex as _rust_project_simplex, solve_ols as _rust_solve_ols, compute_robust_vcov as _rust_compute_robust_vcov, @@ -43,7 +42,6 @@ except ImportError: _rust_available = False _rust_bootstrap_weights = None - _rust_synthetic_weights = None _rust_project_simplex = None _rust_solve_ols = None _rust_compute_robust_vcov = None @@ -66,7 +64,6 @@ # Force pure Python mode - disable Rust even if available HAS_RUST_BACKEND = False _rust_bootstrap_weights = None - _rust_synthetic_weights = None _rust_project_simplex = None _rust_solve_ols = None _rust_compute_robust_vcov = None @@ -115,7 +112,6 @@ def rust_backend_info(): "HAS_RUST_BACKEND", "rust_backend_info", "_rust_bootstrap_weights", - "_rust_synthetic_weights", "_rust_project_simplex", "_rust_solve_ols", "_rust_compute_robust_vcov", diff --git a/diff_diff/prep.py b/diff_diff/prep.py index 70201144..e942cab9 100644 --- a/diff_diff/prep.py +++ b/diff_diff/prep.py @@ -9,6 +9,7 @@ re-exported here for backward compatibility. """ +import warnings from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np @@ -36,7 +37,7 @@ compute_replicate_if_variance, compute_survey_if_variance, ) -from diff_diff.utils import compute_synthetic_weights +from diff_diff.utils import _compute_noise_level, _sc_weight_fw # Constants for rank_control_units _SIMILARITY_THRESHOLD_SD = 0.5 # Controls within this many SDs are "similar" @@ -837,7 +838,10 @@ def rank_control_units( - quality_score: Combined quality score (0-1, higher is better) - outcome_trend_score: Pre-treatment outcome trend similarity - covariate_score: Covariate match score (NaN if no covariates) - - synthetic_weight: Weight from synthetic control optimization + - synthetic_weight: Informational heuristic weight from a single-pass + uncentered Frank-Wolfe solve; does NOT factor into ``quality_score`` + (ranking) and is NOT the canonical SDID unit weight. For canonical + SDID weights use ``SyntheticDiD.fit()``. - pre_trend_rmse: RMSE of pre-treatment outcome vs treated mean - is_required: Whether unit was in require_units @@ -989,8 +993,74 @@ def rank_control_units( # ------------------------------------------------------------------------- # Compute outcome trend scores # ------------------------------------------------------------------------- - # Synthetic weights (higher = better match) - synthetic_weights = compute_synthetic_weights(Y_control, Y_treated_mean, lambda_reg=lambda_reg) + # Informational `synthetic_weight` column. This is a RANKING HEURISTIC, + # not an estimator: it gives a rough "which controls would a synthetic + # regression weight heavily" signal that's reported alongside RMSE and + # covariate distance. The actual ranking (`quality_score`) is computed + # below from `outcome_trend_score` (RMSE-based) + `covariate_score`; the + # `synthetic_weight` column does NOT factor into the ranking decision. + # + # Solver choice. We use a single-pass uncentered Frank-Wolfe via the + # shared `_sc_weight_fw` dispatcher to solve: + # + # min_w ||Y_treated_mean - Y_control @ w||^2 + lambda_reg * ||w||^2 + # s.t. w >= 0, sum(w) = 1 + # + # Mapped to the FW objective `zeta^2 ||w||^2 + (1/N) ||Aw - b||^2` via + # `zeta = sqrt(lambda_reg / N)`. intercept=False because this QP does + # no column-centering, max_iter=1000 to bound ranking-loop cost, + # min_weight=1e-6 post-processing for interpretability. + # + # NOTE — this is INTENTIONALLY NOT the canonical SDID / R + # `synthdid::sc.weight.fw` two-pass unit-weight procedure (that uses + # intercept=TRUE, 100-iter -> sparsify -> 10000-iter). SDID estimation + # still uses that canonical path in `_sc_weight_fw_numpy` at + # `utils.py:_sc_weight_fw_numpy` via `compute_sdid_unit_weights`; this + # ranking heuristic uses a simpler single-pass call to the same solver + # for a cheap diagnostic score. + # + # Replaces the former `compute_synthetic_weights` wrapper whose Rust + # and Python backends had divergent PGD implementations (audit + # finding #22). Net effect: users on default `lambda_reg=0` with + # typical data see `synthetic_weight` values that agree with the old + # code to ~1e-7; extreme Y or `lambda_reg > 0` cases produce values + # that differ from the old code (which was mathematically wrong). + _Y_control = np.ascontiguousarray(Y_control, dtype=np.float64) + _Y_treated_mean = np.ascontiguousarray(Y_treated_mean, dtype=np.float64) + _n_pre, _n_control = _Y_control.shape + if _n_control == 0: + synthetic_weights = np.array([], dtype=np.float64) + elif _n_control == 1: + synthetic_weights = np.array([1.0]) + else: + _zeta = float(np.sqrt(lambda_reg / _n_pre)) if lambda_reg > 0 else 0.0 + # Scale stopping threshold by noise level so convergence stays + # meaningful at any data magnitude. + _sigma = _compute_noise_level(_Y_control) + _min_decrease = 1e-5 * max(_sigma, 1e-12) + _Y_fw = np.column_stack([_Y_control, _Y_treated_mean]) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r".*did not converge.*", + category=UserWarning, + ) + synthetic_weights = _sc_weight_fw( + _Y_fw, + zeta=_zeta, + intercept=False, + min_decrease=_min_decrease, + max_iter=1000, + ) + # Set small weights to zero for interpretability, then renormalize. + synthetic_weights = np.asarray(synthetic_weights, dtype=np.float64) + _min_weight = 1e-6 + synthetic_weights[synthetic_weights < _min_weight] = 0.0 + _total = float(np.sum(synthetic_weights)) + if _total > 0: + synthetic_weights = synthetic_weights / _total + else: + synthetic_weights = np.ones(_n_control) / _n_control # RMSE for each control vs treated mean (use nanmean to handle missing data) rmse_scores = [] diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 47df08e1..bff19e97 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -17,7 +17,6 @@ from diff_diff._backend import ( HAS_RUST_BACKEND, _rust_project_simplex, - _rust_synthetic_weights, _rust_sdid_unit_weights, _rust_compute_time_weights, _rust_compute_noise_level, @@ -1131,115 +1130,14 @@ def equivalence_test_trends( } -def compute_synthetic_weights( - Y_control: np.ndarray, Y_treated: np.ndarray, lambda_reg: float = 0.0, min_weight: float = 1e-6 -) -> np.ndarray: - """ - Compute synthetic control unit weights using constrained optimization. - - Finds weights ω that minimize the squared difference between the - weighted average of control unit outcomes and the treated unit outcomes - during pre-treatment periods. - - Parameters - ---------- - Y_control : np.ndarray - Control unit outcomes matrix of shape (n_pre_periods, n_control_units). - Each column is a control unit, each row is a pre-treatment period. - Y_treated : np.ndarray - Treated unit mean outcomes of shape (n_pre_periods,). - Average across treated units for each pre-treatment period. - lambda_reg : float, default=0.0 - L2 regularization parameter. Larger values shrink weights toward - uniform (1/n_control). Helps prevent overfitting when n_pre < n_control. - min_weight : float, default=1e-6 - Minimum weight threshold. Weights below this are set to zero. - - Returns - ------- - np.ndarray - Unit weights of shape (n_control_units,) that sum to 1. - - Notes - ----- - Solves the quadratic program: - - min_ω ||Y_treated - Y_control @ ω||² + λ||ω - 1/n||² - s.t. ω >= 0, sum(ω) = 1 - - Uses a simplified coordinate descent approach with projection onto simplex. - """ - n_pre, n_control = Y_control.shape - - if n_control == 0: - return np.asarray([]) - - if n_control == 1: - return np.asarray([1.0]) - - # Use Rust backend if available - if HAS_RUST_BACKEND: - Y_control = np.ascontiguousarray(Y_control, dtype=np.float64) - Y_treated = np.ascontiguousarray(Y_treated, dtype=np.float64) - weights = _rust_synthetic_weights( - Y_control, Y_treated, lambda_reg, _OPTIMIZATION_MAX_ITER, _OPTIMIZATION_TOL - ) - else: - # Fallback to NumPy implementation - weights = _compute_synthetic_weights_numpy(Y_control, Y_treated, lambda_reg) - - # Set small weights to zero for interpretability - weights[weights < min_weight] = 0 - if np.sum(weights) > 0: - weights = weights / np.sum(weights) - else: - # Fallback to uniform if all weights are zeroed - weights = np.ones(n_control) / n_control - - return np.asarray(weights) - - -def _compute_synthetic_weights_numpy( - Y_control: np.ndarray, - Y_treated: np.ndarray, - lambda_reg: float = 0.0, -) -> np.ndarray: - """NumPy fallback implementation of compute_synthetic_weights.""" - n_pre, n_control = Y_control.shape - - # Initialize with uniform weights - weights = np.ones(n_control) / n_control - - # Precompute matrices for optimization - # Objective: ||Y_treated - Y_control @ w||^2 + lambda * ||w - w_uniform||^2 - # = w' @ (Y_control' @ Y_control + lambda * I) @ w - 2 * (Y_control' @ Y_treated + lambda * w_uniform)' @ w + const - YtY = Y_control.T @ Y_control - YtT = Y_control.T @ Y_treated - w_uniform = np.ones(n_control) / n_control - - # Add regularization - H = YtY + lambda_reg * np.eye(n_control) - f = YtT + lambda_reg * w_uniform - - # Solve with projected gradient descent - # Project onto probability simplex - step_size = 1.0 / (np.linalg.norm(H, 2) + _NUMERICAL_EPS) - - for _ in range(_OPTIMIZATION_MAX_ITER): - weights_old = weights.copy() - - # Gradient step: minimize ||Y - Y_control @ w||^2 - grad = H @ weights - f - weights = weights - step_size * grad - - # Project onto simplex (sum to 1, non-negative) - weights = _project_simplex(weights) - - # Check convergence - if np.linalg.norm(weights - weights_old) < _OPTIMIZATION_TOL: - break - - return weights +# compute_synthetic_weights and _compute_synthetic_weights_numpy removed in the +# silent-failures audit post-cleanup (finding #22). The one caller +# (`diff_diff.prep.rank_control_units`) inlines a single-pass, uncentered +# Frank-Wolfe via the shared `_sc_weight_fw` dispatcher — a ranking heuristic, +# NOT the canonical SDID/R `synthdid::sc.weight.fw` two-pass procedure +# (intercept=True, 100-iter -> sparsify -> 10000-iter). Canonical SDID unit +# weights go through `compute_sdid_unit_weights` (see `_sc_weight_fw_numpy` +# below and REGISTRY.md SDID section). def _project_simplex(v: np.ndarray) -> np.ndarray: diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 62440f7d..77382274 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -3124,25 +3124,9 @@ def test_project_simplex(self): assert abs(np.sum(projected) - 1.0) < 1e-6 assert np.all(projected >= 0) - def test_compute_synthetic_weights(self): - """Test synthetic weight computation.""" - from diff_diff.utils import compute_synthetic_weights - - np.random.seed(42) - n_pre = 5 - n_control = 10 - - Y_control = np.random.randn(n_pre, n_control) - Y_treated = np.random.randn(n_pre) - - weights = compute_synthetic_weights(Y_control, Y_treated) - - # Weights should sum to 1 - assert abs(np.sum(weights) - 1.0) < 1e-6 - # Weights should be non-negative - assert np.all(weights >= 0) - # Should have correct length - assert len(weights) == n_control + # test_compute_synthetic_weights removed in the silent-failures audit + # post-cleanup (finding #22). Helper deleted; behavior now covered via + # tests/test_prep.py::TestRankControlUnits (its sole caller). def test_compute_time_weights(self): """Test time weight computation with Frank-Wolfe solver.""" diff --git a/tests/test_prep.py b/tests/test_prep.py index a9818e4a..0e4bf5d3 100644 --- a/tests/test_prep.py +++ b/tests/test_prep.py @@ -766,6 +766,126 @@ def test_single_control_unit(self): # Single control should get score of 1.0 (best possible) assert result["quality_score"].iloc[0] == 1.0 + def test_extreme_Y_scale_synthetic_weight_column(self): + """Finding #22 (post-audit cleanup): `synthetic_weight` column must + remain a valid non-degenerate simplex vector even at extreme Y + scale (Y ~ 1e9). The previous `compute_synthetic_weights` wrapper + had two bugs here: Rust PGD collapsed to a single vertex, Python + PGD stalled at uniform. The inlined Frank-Wolfe solver in + ``rank_control_units`` handles both cases correctly.""" + from diff_diff.prep import rank_control_units + + data = generate_did_data(n_units=12, n_periods=8, seed=42) + # Shift outcomes to extreme scale — the exact condition the deleted + # wrapper mishandled. + data = data.copy() + data["outcome"] = data["outcome"] + 1e9 + + result = rank_control_units( + data, + unit_column="unit", + time_column="period", + outcome_column="outcome", + treatment_column="treated", + ) + + weights = result["synthetic_weight"].to_numpy() + # Valid simplex: non-negative, sums to 1. + assert np.all(weights >= 0), "synthetic_weight must be non-negative" + assert abs(weights.sum() - 1.0) < 1e-10, ( + f"synthetic_weight should sum to 1.0, got {weights.sum()}" + ) + # Non-degenerate: at least 2 controls receive non-trivial weight. + # This guards the Rust-PGD collapse-to-one-vertex bug that + # previously fired at Y ~ 1e9 under the deleted wrapper. + assert int(np.sum(weights > 1e-6)) >= 2, ( + f"synthetic_weight collapsed to a single vertex at extreme Y " + f"scale; n_nonzero={int(np.sum(weights > 1e-6))}. weights={weights}" + ) + + def test_synthetic_weight_column_with_lambda_reg(self): + """`rank_control_units(lambda_reg > 0)` regression. Non-zero + regularization must produce a valid simplex vector and should pull + weights toward a more uniform distribution vs `lambda_reg=0`.""" + from diff_diff.prep import rank_control_units + + data = generate_did_data(n_units=10, n_periods=8, seed=42) + + res_unreg = rank_control_units( + data, + unit_column="unit", + time_column="period", + outcome_column="outcome", + treatment_column="treated", + lambda_reg=0.0, + ) + res_reg = rank_control_units( + data, + unit_column="unit", + time_column="period", + outcome_column="outcome", + treatment_column="treated", + lambda_reg=1.0, + ) + + w_unreg = res_unreg["synthetic_weight"].to_numpy() + w_reg = res_reg["synthetic_weight"].to_numpy() + + for w, label in [(w_unreg, "unregularized"), (w_reg, "regularized")]: + assert np.all(w >= 0), f"{label} weights must be non-negative" + assert abs(w.sum() - 1.0) < 1e-10, ( + f"{label} weights should sum to 1.0, got {w.sum()}" + ) + + # Regularization should increase entropy (pull toward uniform) or at + # least not collapse the simplex — a valid regularized solution must + # put non-trivial mass on at least as many donors as the unregularized + # one for a well-conditioned input. + assert int(np.sum(w_reg > 1e-6)) >= int(np.sum(w_unreg > 1e-6)) - 1, ( + f"lambda_reg=1.0 should not collapse support vs lambda_reg=0; " + f"n_nonzero_reg={int(np.sum(w_reg > 1e-6))}, " + f"n_nonzero_unreg={int(np.sum(w_unreg > 1e-6))}" + ) + + def test_synthetic_weight_column_backend_parity(self): + """Full-pipeline Rust/Python parity for `rank_control_units`. Forces + the Python FW path via `HAS_RUST_BACKEND=False` and verifies the + resulting `synthetic_weight` column agrees with the default (Rust + when available) path to a loose tolerance. Backend divergence on + this caller path was the trigger for deleting the old wrapper.""" + from unittest.mock import patch + + from diff_diff import utils as utils_mod + from diff_diff.prep import rank_control_units + + data = generate_did_data(n_units=10, n_periods=8, seed=123) + + kwargs = dict( + unit_column="unit", + time_column="period", + outcome_column="outcome", + treatment_column="treated", + ) + + res_default = rank_control_units(data, **kwargs) + with patch.object(utils_mod, "HAS_RUST_BACKEND", False): + res_python = rank_control_units(data, **kwargs) + + # Align by unit id (ranking order is not a backend-parity property; + # the simplex values on common donors are). + merged = res_default[["unit", "synthetic_weight"]].merge( + res_python[["unit", "synthetic_weight"]], + on="unit", + suffixes=("_default", "_python"), + ) + # FW stopping threshold is scale-dependent, so use a loose tolerance. + np.testing.assert_allclose( + merged["synthetic_weight_default"].to_numpy(), + merged["synthetic_weight_python"].to_numpy(), + atol=1e-4, + rtol=1e-4, + ) + class TestGenerateStaggeredData: """Tests for generate_staggered_data function.""" diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index 25ae10ef..6a35fc54 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -104,39 +104,10 @@ def test_bootstrap_different_seeds(self): # Synthetic Weight Tests # ========================================================================= - def test_synthetic_weights_sum_to_one(self): - """Test synthetic weights sum to 1.""" - from diff_diff._rust_backend import compute_synthetic_weights - - np.random.seed(42) - Y_control = np.random.randn(10, 5) - Y_treated = np.random.randn(10) - - weights = compute_synthetic_weights(Y_control, Y_treated, 0.0, 1000, 1e-8) - assert abs(weights.sum() - 1.0) < 1e-6, f"Weights should sum to 1, got {weights.sum()}" - - def test_synthetic_weights_non_negative(self): - """Test synthetic weights are non-negative.""" - from diff_diff._rust_backend import compute_synthetic_weights - - np.random.seed(42) - Y_control = np.random.randn(10, 5) - Y_treated = np.random.randn(10) - - weights = compute_synthetic_weights(Y_control, Y_treated, 0.0, 1000, 1e-8) - assert np.all(weights >= -1e-10), "Weights should be non-negative" - - def test_synthetic_weights_shape(self): - """Test synthetic weights have correct shape.""" - from diff_diff._rust_backend import compute_synthetic_weights - - np.random.seed(42) - n_control = 8 - Y_control = np.random.randn(10, n_control) - Y_treated = np.random.randn(10) - - weights = compute_synthetic_weights(Y_control, Y_treated, 0.0, 1000, 1e-8) - assert weights.shape == (n_control,) + # Tests for `compute_synthetic_weights` direct Rust binding removed in + # the silent-failures audit post-cleanup (finding #22). The helper was + # deleted from the Python layer; remaining Rust symbol is dead code + # (tracked as a low-priority Rust cleanup TODO). # ========================================================================= # Simplex Projection Tests @@ -729,52 +700,10 @@ def test_bootstrap_weights_webb_properties(self): # Synthetic Weights Equivalence # ========================================================================= - def test_synthetic_weights_match(self): - """Test Rust and NumPy synthetic weights produce similar results.""" - from diff_diff._rust_backend import compute_synthetic_weights as rust_fn - from diff_diff.utils import _compute_synthetic_weights_numpy as numpy_fn - - np.random.seed(42) - Y_control = np.random.randn(10, 5) - Y_treated = np.random.randn(10) - - rust_weights = rust_fn(Y_control, Y_treated, 0.0, 1000, 1e-8) - numpy_weights = numpy_fn(Y_control, Y_treated, 0.0) - - # Both should be valid simplex weights - assert abs(rust_weights.sum() - 1.0) < 1e-6, "Rust weights should sum to 1" - assert abs(numpy_weights.sum() - 1.0) < 1e-6, "NumPy weights should sum to 1" - assert np.all(rust_weights >= -1e-6), "Rust weights should be non-negative" - assert np.all(numpy_weights >= -1e-6), "NumPy weights should be non-negative" - - # Reconstruction error should be similar - rust_error = np.linalg.norm(Y_treated - Y_control @ rust_weights) - numpy_error = np.linalg.norm(Y_treated - Y_control @ numpy_weights) - assert abs(rust_error - numpy_error) < 0.5, \ - f"Reconstruction errors should be similar: rust={rust_error:.4f}, numpy={numpy_error:.4f}" - - def test_synthetic_weights_with_regularization(self): - """Test Rust synthetic weights with L2 regularization.""" - from diff_diff._rust_backend import compute_synthetic_weights as rust_fn - from diff_diff.utils import _compute_synthetic_weights_numpy as numpy_fn - - np.random.seed(42) - Y_control = np.random.randn(15, 8) - Y_treated = np.random.randn(15) - lambda_reg = 0.1 - - rust_weights = rust_fn(Y_control, Y_treated, lambda_reg, 1000, 1e-8) - numpy_weights = numpy_fn(Y_control, Y_treated, lambda_reg) - - # Both should be valid simplex weights - assert abs(rust_weights.sum() - 1.0) < 1e-6 - assert abs(numpy_weights.sum() - 1.0) < 1e-6 - - # With regularization, weights should be more spread out (higher entropy) - rust_entropy = -np.sum(rust_weights * np.log(rust_weights + 1e-10)) - numpy_entropy = -np.sum(numpy_weights * np.log(numpy_weights + 1e-10)) - assert rust_entropy > 0.5, "Regularized weights should have positive entropy" - assert numpy_entropy > 0.5, "Regularized weights should have positive entropy" + # Rust/NumPy synthetic_weights parity tests removed in the silent-failures + # audit post-cleanup (finding #22). Helper deleted; parity is now a + # non-question since both paths route through the shared `_sc_weight_fw` + # dispatcher in `utils.py`. def test_simplex_projection_match(self): """Test Rust and NumPy simplex projection match exactly.""" @@ -2214,104 +2143,10 @@ def test_rank_deficient_collinear(self): ) -@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") -class TestSyntheticWeightsBackendParity: - """Finding #22: `compute_synthetic_weights` Rust vs Python parity. - - Both backends solve the same constrained QP (FW + simplex projection) - with ``_OPTIMIZATION_TOL=1e-8``. PR #312 normalized Y inside the SDID - fit path; the shared utility at ``utils.py:1134-1199`` is unchanged - and is called from ``prep.py:990`` (DGP) only. - - Tolerance: ``atol=1e-7, rtol=1e-6`` on weights — loose enough to - absorb the FW convergence tolerance (1e-8) plus BLAS platform drift - (~1e-12 to 1e-14 on float64), tight enough to catch real algorithmic - divergence. - """ - - def _run_both_backends(self, Y_control, Y_treated, lambda_reg=0.0): - import sys - from unittest.mock import patch - - from diff_diff.utils import compute_synthetic_weights - - w_rust = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg) - - utils_module = sys.modules["diff_diff.utils"] - with patch.object(utils_module, "HAS_RUST_BACKEND", False): - w_py = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg) - return w_rust, w_py - - def test_near_singular_Y_prime_Y(self): - """Highly collinear control units make Y'Y near-singular. Both - backends should converge to the same (near-uniform or zeroed-out) - weight vector.""" - rng = np.random.default_rng(7) - n_pre, n_control = 5, 4 - base = rng.normal(0, 1, n_pre) - # Three of four control units are nearly parallel to `base` - Y_control = np.column_stack([ - base, - base + 1e-10 * rng.normal(0, 1, n_pre), - base + 1e-10 * rng.normal(0, 1, n_pre), - rng.normal(0, 1, n_pre), - ]) - Y_treated = base + 1e-8 * rng.normal(0, 1, n_pre) - - w_rust, w_py = self._run_both_backends(Y_control, Y_treated) - - np.testing.assert_allclose( - w_rust, w_py, atol=1e-7, rtol=1e-6, - err_msg="Near-singular Y'Y: weight divergence between Rust and Python", - ) - - @pytest.mark.xfail( - strict=True, - reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is " - "projected gradient descent. PGD and FW can converge to different " - "vertices under near-degenerate / extreme-scale inputs even though " - "both solve the same constrained QP. Algorithmic unification is a " - "larger fix (P1 follow-up in TODO.md); this xfail baselines the gap " - "so we notice if/when the algorithms align.", - ) - def test_extreme_Y_scale(self): - """Y at 1e9 scale, well-conditioned panel. Known Rust-vs-Python - divergence: Rust/FW concentrates on one unit, Python/PGD returns - near-uniform. See xfail reason.""" - rng = np.random.default_rng(44) - n_pre, n_control = 8, 5 - Y_control = rng.normal(0, 1, (n_pre, n_control)) + 1e9 - Y_treated = rng.normal(0, 1, n_pre) + 1e9 - - w_rust, w_py = self._run_both_backends(Y_control, Y_treated) - - np.testing.assert_allclose( - w_rust, w_py, atol=1e-7, rtol=1e-6, - err_msg="Extreme Y scale: weight divergence " - "(known FW-vs-PGD algorithmic mismatch; see xfail reason)", - ) - - @pytest.mark.xfail( - strict=True, - reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is " - "projected gradient descent. Same algorithmic mismatch as " - "test_extreme_Y_scale — backends reach different simplex vertices " - "on moderate-scale random data. P1 follow-up in TODO.md.", - ) - def test_regularization_lambda_variations(self): - """Parity across lambda_reg=0 (unregularized) and lambda_reg>0 - (shrink toward uniform). Known divergence; see xfail reason.""" - rng = np.random.default_rng(55) - n_pre, n_control = 8, 5 - Y_control = rng.normal(0, 1, (n_pre, n_control)) - Y_treated = rng.normal(0, 1, n_pre) - - for lr in (0.0, 0.01, 1.0): - w_rust, w_py = self._run_both_backends(Y_control, Y_treated, lambda_reg=lr) - np.testing.assert_allclose( - w_rust, w_py, atol=1e-7, rtol=1e-6, - err_msg=f"lambda_reg={lr}: weight divergence between Rust and Python", - ) +# TestSyntheticWeightsBackendParity removed in the silent-failures audit +# post-cleanup (finding #22). The wrapper it tested was deleted; the FW +# computation is inlined in `rank_control_units` (prep.py:990) and covered +# there by tests/test_prep.py::TestRankControlUnits. @pytest.mark.slow diff --git a/tests/test_utils.py b/tests/test_utils.py index 61f2d353..9d605955 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -25,7 +25,6 @@ compute_p_value, compute_robust_se, compute_sdid_estimator, - compute_synthetic_weights, compute_time_weights, equivalence_test_trends, safe_inference, @@ -1125,59 +1124,9 @@ def test_tighter_margin_harder_to_pass(self, parallel_trends_data): assert results_wide["tost_p_value"] <= results_tight["tost_p_value"] -# ============================================================================= -# Additional tests for compute_synthetic_weights -# ============================================================================= - - -class TestComputeSyntheticWeightsEdgeCases: - """Edge case tests for compute_synthetic_weights.""" - - def test_empty_control_matrix(self): - """Test with empty control matrix.""" - Y_control = np.zeros((5, 0)) - Y_treated = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) - - weights = compute_synthetic_weights(Y_control, Y_treated) - - assert len(weights) == 0 - - def test_single_control_unit(self): - """Test with single control unit.""" - Y_control = np.array([[1.0], [2.0], [3.0]]) - Y_treated = np.array([1.0, 2.0, 3.0]) - - weights = compute_synthetic_weights(Y_control, Y_treated) - - assert len(weights) == 1 - assert abs(weights[0] - 1.0) < 1e-6 - - def test_regularization_effect(self): - """Test that regularization affects weight sparsity.""" - np.random.seed(42) - Y_control = np.random.randn(10, 5) - Y_treated = np.random.randn(10) - - weights_no_reg = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=0.0) - weights_high_reg = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=10.0) - - # High regularization should give more uniform weights - var_no_reg = np.var(weights_no_reg) - var_high_reg = np.var(weights_high_reg) - - assert var_high_reg < var_no_reg + 0.01 - - def test_min_weight_threshold(self): - """Test that small weights are zeroed out.""" - np.random.seed(42) - Y_control = np.random.randn(10, 5) - Y_treated = np.random.randn(10) - - weights = compute_synthetic_weights(Y_control, Y_treated, min_weight=0.01) - - # All non-zero weights should be >= min_weight - non_zero_weights = weights[weights > 0] - assert np.all(non_zero_weights >= 0.01) +# Removed TestComputeSyntheticWeightsEdgeCases in the silent-failures audit +# post-cleanup (finding #22). The `compute_synthetic_weights` helper was deleted; +# its behavior is now exercised via `rank_control_units` in test_prep.py. # =============================================================================