diff --git a/.github/workflows/performance-parity.yml b/.github/workflows/performance-parity.yml index 6e59c4b0..53b08358 100644 --- a/.github/workflows/performance-parity.yml +++ b/.github/workflows/performance-parity.yml @@ -59,10 +59,11 @@ jobs: --python-report output/performance/python_performance_report.json \ --matlab-report tests/performance/fixtures/matlab/performance_baseline_470fde8.json \ --policy parity/performance_gate_policy.yml \ - --previous-python-report tests/performance/fixtures/python/performance_baseline_20260303.json \ + --previous-python-report tests/performance/fixtures/python/performance_baseline_linux_20260304.json \ --report-out output/performance/performance_parity_report.json \ --csv-out output/performance/performance_parity_report.csv \ - --fail-on-regression + --fail-on-regression \ + --require-regression-env-match - name: Run pytest-benchmark smoke suite env: @@ -80,5 +81,5 @@ jobs: output/performance/*.json output/performance/*.csv tests/performance/fixtures/matlab/performance_baseline_470fde8.json - tests/performance/fixtures/python/performance_baseline_20260303.json + tests/performance/fixtures/python/performance_baseline_linux_20260304.json if-no-files-found: warn diff --git a/README.md b/README.md index 29f312f5..560a63de 100644 --- a/README.md +++ b/README.md @@ -187,10 +187,11 @@ python tools/performance/compare_matlab_python_performance.py \ --python-report output/performance/python_performance_report.json \ --matlab-report tests/performance/fixtures/matlab/performance_baseline_470fde8.json \ --policy parity/performance_gate_policy.yml \ - --previous-python-report tests/performance/fixtures/python/performance_baseline_20260303.json \ + --previous-python-report tests/performance/fixtures/python/performance_baseline_linux_20260304.json \ --report-out parity/performance_parity_report.json \ --csv-out parity/performance_parity_report.csv \ - --fail-on-regression + --fail-on-regression \ + --require-regression-env-match ``` 
Generate MATLAB baseline report (controlled environment): diff --git a/parity/CYCLE_VALIDATION_CHECKLIST.md b/parity/CYCLE_VALIDATION_CHECKLIST.md new file mode 100644 index 00000000..a3588b45 --- /dev/null +++ b/parity/CYCLE_VALIDATION_CHECKLIST.md @@ -0,0 +1,47 @@ +# Cycle Validation Checklist (2026-03-04) + +Commands used each cycle: +- `pytest -q` +- `python tools/parity/build_numeric_drift_report.py --fixtures-manifest tests/parity/fixtures/matlab_gold/manifest.yml --thresholds parity/numeric_drift_thresholds.yml --report-out parity/numeric_drift_report.json --fail-on-violation` +- `python tools/parity/check_functional_parity_progress.py --report parity/function_example_alignment_report.json --policy parity/functional_gate_policy.yml` +- `python tools/parity/check_example_output_spec.py --report parity/function_example_alignment_report.json --spec parity/example_output_spec.yml` +- `python tools/reports/generate_validation_pdf.py --repo-root "$PWD" --matlab-help-root /Users/iahncajigas/Library/CloudStorage/Dropbox/Research/Matlab/nSTAT_currentRelease_Local/helpfiles --notebook-group all --timeout 900 --skip-command-tests --parity-mode gate --enforce-unique-images --min-unique-images-per-topic 1 --max-cross-topic-reuse-ratio 1.0` +- `python tools/reports/generate_validation_pdf.py --repo-root "$PWD" --matlab-help-root /Users/iahncajigas/Library/CloudStorage/Dropbox/Research/Matlab/nSTAT_currentRelease_Local/helpfiles --notebook-group all --timeout 900 --skip-command-tests --parity-mode image --skip-parity-check` +- `python tools/reports/build_image_parity_pdfs.py --report-json --python-out output/pdf/image_mode_parity/python_pages.pdf --matlab-out output/pdf/image_mode_parity/matlab_pages.pdf --pairs-json output/pdf/image_mode_parity/pairs.json` +- `python tools/reports/check_pdf_image_parity.py --python-pdf output/pdf/image_mode_parity/python_pages.pdf --matlab-pdf output/pdf/image_mode_parity/matlab_pages.pdf --out-dir output/pdf/image_mode_parity --dpi 150 
--ssim-threshold 0.70 --max-failing-pages 0` +- `python tools/performance/run_python_benchmarks.py --tiers S --repeats 5 --warmup 1 --out-json output/performance/python_performance_report.json --out-csv output/performance/python_performance_report.csv` +- `python tools/performance/compare_matlab_python_performance.py --python-report output/performance/python_performance_report.json --matlab-report tests/performance/fixtures/matlab/performance_baseline_470fde8.json --policy parity/performance_gate_policy.yml --previous-python-report tests/performance/fixtures/python/performance_baseline_linux_20260304.json --report-out output/performance/performance_parity_report.json --csv-out output/performance/performance_parity_report.csv --fail-on-regression --require-regression-env-match` +- Local macOS reruns use the same command but pass `tests/performance/fixtures/python/performance_baseline_20260303.json` as `--previous-python-report` so that the strict environment-match check is satisfied. + +## Cycle 1 +- Log: `output/cycle/cycle1.log` +- `pytest`: PASS +- numeric drift (0 failed topics): PASS +- functional parity (no gaps/partials): PASS +- example output spec: PASS +- gate-mode validation PDF (0 parity failures, 0 uniqueness violations): PASS +- image-mode parity (0 failing pages): PASS +- performance-parity (0 regression failures): PASS +- Fixes applied in cycle: comparator option to require regression env match + regression test coverage. + +## Cycle 2 +- Log: `output/cycle/cycle2.log` +- `pytest`: PASS +- numeric drift (0 failed topics): PASS +- functional parity (no gaps/partials): PASS +- example output spec: PASS +- gate-mode validation PDF (0 parity failures, 0 uniqueness violations): PASS +- image-mode parity (0 failing pages): PASS +- performance-parity (0 regression failures): PASS +- Fixes applied in cycle: Linux baseline + strict regression env matching in workflow/tests, vectorized decoding `computeSpikeRateCIs`, and new deterministic performance workloads for `nspikeTrain.getSigRep` and `Analysis.fitGLM`. 
+ +## Cycle 3 +- Log: `output/cycle/cycle3.log` +- `pytest`: PASS +- numeric drift (0 failed topics): PASS +- functional parity (no gaps/partials): PASS +- example output spec: PASS +- gate-mode validation PDF (0 parity failures, 0 uniqueness violations): PASS +- image-mode parity (0 failing pages): PASS +- performance-parity (0 regression failures): PASS +- Fixes applied in cycle: none required; full acceptance suite rerun clean after Cycle 2 changes. diff --git a/src/nstat/compat/matlab/__init__.py b/src/nstat/compat/matlab/__init__.py index db7745f1..09829dfb 100644 --- a/src/nstat/compat/matlab/__init__.py +++ b/src/nstat/compat/matlab/__init__.py @@ -3383,36 +3383,37 @@ def _compute_spike_rate_cis_matlab( chol_m = DecodingAlgorithms._chol_like_matlab(Wku_temp) if chol_m.shape != (K, K): raise ValueError("Wku covariance slice must be KxK") - for c in range(int(Mc)): - z = rng.normal(0.0, 1.0, size=(K,)) - xK_draw[r, :, c] = xK_arr[r, :] + (chol_m @ z) + # Preserve MATLAB-parity draw ordering by sampling (Mc, K), where each row is one Monte-Carlo draw. 
+ z_draw = rng.normal(0.0, 1.0, size=(int(Mc), K)) + xK_draw[r, :, :] = xK_arr[r, :][:, None] + (chol_m @ z_draw.T) - lambda_delta = np.zeros((dN_arr.shape[1], K, int(Mc)), dtype=float) spike_rate = np.zeros((int(Mc), K), dtype=float) - for c in range(int(Mc)): + mask = (time >= float(t0)) & (time <= float(tf)) + interval = max(float(tf - t0), np.finfo(float).eps) + integrate_fn = getattr(np, "trapezoid", None) + if integrate_fn is None: + integrate_fn = getattr(np, "trapz", None) # pragma: no cover - NumPy<2 fallback + + if window_vals.size > 0 and np.any(np.abs(gamma_vec) > 0.0): + hist_term = np.zeros((dN_arr.shape[1], K), dtype=float) for k in range(K): - stim_k = basis_mat @ xK_draw[:, k, c] - if window_vals.size > 0 and np.any(np.abs(gamma_vec) > 0.0): - hk = Hk[k] - cols = min(hk.shape[1], gamma_vec.size) - hist_lin = hk[:, :cols] @ gamma_vec[:cols] - else: - hist_lin = np.zeros(stim_k.shape[0], dtype=float) - eta = stim_k + hist_lin - if fit_type == "poisson": - lam = np.exp(eta) - else: - exp_eta = np.exp(eta) - lam = exp_eta / (1.0 + exp_eta) - lambda_delta[:, k, c] = lam - rates = lambda_delta[:, :, c] / float(delta) - mask = (time >= float(t0)) & (time <= float(tf)) + hk = Hk[k] + cols = min(hk.shape[1], gamma_vec.size) + hist_term[:, k] = hk[:, :cols] @ gamma_vec[:cols] + else: + hist_term = np.zeros((dN_arr.shape[1], K), dtype=float) + + for c in range(int(Mc)): + stim_ck = basis_mat @ xK_draw[:, :, c] + eta = stim_ck + hist_term + if fit_type == "poisson": + rates = np.exp(eta) / float(delta) + else: + exp_eta = np.exp(eta) + rates = (exp_eta / (1.0 + exp_eta)) / float(delta) if np.sum(mask) < 2: integral_vals = np.zeros(K, dtype=float) else: - integrate_fn = getattr(np, "trapezoid", None) - if integrate_fn is None: - integrate_fn = getattr(np, "trapz", None) # pragma: no cover - NumPy<2 fallback if integrate_fn is None: # pragma: no cover - extreme fallback dt_vec = np.diff(time[mask]).reshape(-1, 1) y0 = rates[mask, :][:-1, :] @@ -3423,7 +3424,7 @@ 
def _compute_spike_rate_cis_matlab( integrate_fn(rates[mask, :], x=time[mask], axis=0), dtype=float, ) - spike_rate[c, :] = integral_vals / max(float(tf - t0), np.finfo(float).eps) + spike_rate[c, :] = integral_vals / interval CIs = np.zeros((K, 2), dtype=float) for k in range(K): @@ -3451,10 +3452,9 @@ def _compute_spike_rate_cis_matlab( ci_obj.setColor("b") spike_rate_sig.setConfInterval(ci_obj) - prob_mat = np.zeros((K, K), dtype=float) - for k in range(K): - for m in range(k + 1, K): - prob_mat[k, m] = float(np.sum(spike_rate[:, m] > spike_rate[:, k])) / float(Mc) + # prob_mat(k,m) = P(rate_m > rate_k), with MATLAB-style upper-triangle usage. + prob_full = np.mean(spike_rate[:, None, :] > spike_rate[:, :, None], axis=0) + prob_mat = np.triu(np.asarray(prob_full, dtype=float), k=1) sig_mat = (prob_mat > (1.0 - float(alphaVal))).astype(float) return spike_rate_sig, prob_mat, sig_mat diff --git a/src/nstat/performance_workloads.py b/src/nstat/performance_workloads.py index 8e744981..7493ee93 100644 --- a/src/nstat/performance_workloads.py +++ b/src/nstat/performance_workloads.py @@ -7,7 +7,7 @@ import numpy as np -from nstat.compat.matlab import CIF, Covariate, DecodingAlgorithms, History, nstColl +from nstat.compat.matlab import Analysis, CIF, Covariate, DecodingAlgorithms, History, nspikeTrain, nstColl TIER_ORDER = ("S", "M", "L") @@ -17,6 +17,8 @@ "history_design_matrix", "simulate_cif_thinning", "decoding_spike_rate_cis", + "nspiketrain_get_sigrep", + "analysis_fit_glm_pipeline", ) @@ -36,6 +38,10 @@ class CaseConfig: n_bins: int = 120 mc_draws: int = 30 decode_delta_s: float = 0.01 + sigrep_bin_s: float = 0.001 + glm_n_samples: int = 1000 + glm_n_features: int = 6 + glm_dt_s: float = 0.001 def get_case_config(case: str, tier: str) -> CaseConfig: @@ -74,6 +80,18 @@ def get_case_config(case: str, tier: str) -> CaseConfig: "M": dict(num_basis=6, num_trials=8, n_bins=200, mc_draws=50, decode_delta_s=0.01), "L": dict(num_basis=8, num_trials=12, n_bins=320, 
mc_draws=80, decode_delta_s=0.01), } + elif case == "nspiketrain_get_sigrep": + vals = { + "S": dict(n_spikes=800, duration_s=2.0, sigrep_bin_s=0.002), + "M": dict(n_spikes=3000, duration_s=3.0, sigrep_bin_s=0.001), + "L": dict(n_spikes=9000, duration_s=5.0, sigrep_bin_s=0.001), + } + elif case == "analysis_fit_glm_pipeline": + vals = { + "S": dict(glm_n_samples=900, glm_n_features=6, glm_dt_s=0.001), + "M": dict(glm_n_samples=1800, glm_n_features=8, glm_dt_s=0.001), + "L": dict(glm_n_samples=3200, glm_n_features=10, glm_dt_s=0.001), + } else: raise ValueError(f"Unknown case: {case}") @@ -101,6 +119,23 @@ def _deterministic_decode_inputs(cfg: CaseConfig) -> tuple[np.ndarray, np.ndarra return xk, wku, d_n +def _deterministic_glm_inputs(cfg: CaseConfig) -> tuple[np.ndarray, np.ndarray]: + n = int(cfg.glm_n_samples) + p = int(cfg.glm_n_features) + t = np.linspace(0.0, 1.0, n, dtype=float) + X = np.zeros((n, p), dtype=float) + for j in range(p): + f = float(j + 1) + X[:, j] = np.sin(2.0 * np.pi * f * t) + 0.35 * np.cos(2.0 * np.pi * (f + 0.5) * t) + + beta = np.linspace(-0.25, 0.30, p, dtype=float) + eta = -2.0 + X @ beta + mu = np.exp(np.clip(eta, -25.0, 25.0)) * float(cfg.glm_dt_s) + phase = np.sin(np.arange(n, dtype=float) * 0.071) + 1.0 + y = np.floor(mu + 0.35 * phase).astype(float) + return X, y + + def run_python_workload(case: str, tier: str, seed: int = 20260303) -> dict[str, float]: """Execute one deterministic Python workload and return summary metrics.""" @@ -183,4 +218,26 @@ def run_python_workload(case: str, tier: str, seed: int = 20260303) -> dict[str, "rate_mean": float(np.mean(rate)), } + if case == "nspiketrain_get_sigrep": + spikes = _deterministic_spike_times(cfg.n_spikes, cfg.duration_s) + train = nspikeTrain(spikes, t_start=0.0, t_end=float(cfg.duration_s), name="perf_unit") + sig_binary = np.asarray(train.getSigRep(binSize_s=cfg.sigrep_bin_s, mode="binary"), dtype=float) + sig_count = np.asarray(train.getSigRep(binSize_s=cfg.sigrep_bin_s, 
mode="count"), dtype=float) + return { + "n_bins": float(sig_binary.size), + "binary_sum": float(np.sum(sig_binary)), + "count_sum": float(np.sum(sig_count)), + } + + if case == "analysis_fit_glm_pipeline": + X, y = _deterministic_glm_inputs(cfg) + fit = Analysis.fitGLM(X=X, y=y, fitType="poisson", dt=float(cfg.glm_dt_s)) + pred = np.asarray(fit.predict(X), dtype=float) + return { + "coeff_norm": float(np.linalg.norm(fit.coefficients)), + "intercept": float(fit.intercept), + "log_likelihood": float(fit.log_likelihood), + "pred_mean": float(np.mean(pred)), + } + raise ValueError(f"Unhandled workload case: {case}") diff --git a/tests/performance/fixtures/python/performance_baseline_linux_20260304.csv b/tests/performance/fixtures/python/performance_baseline_linux_20260304.csv new file mode 100644 index 00000000..fdd75e7b --- /dev/null +++ b/tests/performance/fixtures/python/performance_baseline_linux_20260304.csv @@ -0,0 +1,16 @@ +case,tier,repeats,median_runtime_ms,mean_runtime_ms,std_runtime_ms,median_peak_memory_mb,summary +unit_impulse_basis,S,7,1.7549330000008467,1.767814000002334,0.03712841168356124,0.39142704010009766,"{""cols"": 50.0, ""rows"": 501.0, ""total_mass"": 500.0}" +unit_impulse_basis,M,7,5.230034999996747,5.244222714285992,0.04359156024491504,3.076793670654297,"{""cols"": 100.0, ""rows"": 2001.0, ""total_mass"": 2000.0}" +unit_impulse_basis,L,7,10.598025999996707,10.61451614285959,0.03639945260812847,18.373775482177734,"{""cols"": 200.0, ""rows"": 6001.0, ""total_mass"": 6000.0}" +covariate_resample,S,7,0.3187809999900537,0.31151314285742565,0.027315777992366848,0.06198883056640625,"{""cols"": 1.0, ""rows"": 1001.0, ""signal_energy"": 0.5195204795204795}" +covariate_resample,M,7,0.3733120000077861,0.3833688571432958,0.027679721024068783,0.1355915069580078,"{""cols"": 1.0, ""rows"": 3001.0, ""signal_energy"": 0.5198042747802833}" +covariate_resample,L,7,0.4388640000030364,0.43310742857321266,0.016053381349413916,0.2376575469970703,"{""cols"": 1.0, 
""rows"": 6001.0, ""signal_energy"": 0.5199200133311115}" +history_design_matrix,S,7,0.34009099999821046,0.3473752857193598,0.02510644529088935,0.0890655517578125,"{""cols"": 4.0, ""rows"": 1000.0, ""total_count"": 9737.0}" +history_design_matrix,M,7,0.7935309999993478,0.8029685714266829,0.02089668436143026,0.43688201904296875,"{""cols"": 4.0, ""rows"": 5000.0, ""total_count"": 243740.0}" +history_design_matrix,L,7,1.6487759999961327,1.6487702857140059,0.03152991976246771,0.8869400024414062,"{""cols"": 4.0, ""rows"": 10000.0, ""total_count"": 1462420.0}" +simulate_cif_thinning,S,7,11.90902599999788,11.841307428570401,0.2313883554103292,0.07111740112304688,"{""mean_spikes_per_unit"": 13.8, ""num_units"": 5.0, ""total_spikes"": 69.0}" +simulate_cif_thinning,M,7,46.7842029999872,46.623392285716086,0.34392725136599767,0.13444900512695312,"{""mean_spikes_per_unit"": 23.5, ""num_units"": 10.0, ""total_spikes"": 235.0}" +simulate_cif_thinning,L,7,138.86579399999732,138.25536671428398,2.017449271316567,0.20075607299804688,"{""mean_spikes_per_unit"": 36.65, ""num_units"": 20.0, ""total_spikes"": 733.0}" +decoding_spike_rate_cis,S,7,29.966428000008705,29.846530285713666,0.3328735707401364,0.2328357696533203,"{""num_trials"": 6.0, ""prob_mean"": 0.1509259259259259, ""rate_mean"": 50.4457886636761, ""sig_count"": 0.0}" +decoding_spike_rate_cis,M,7,62.59277799999552,63.25780771428567,1.3782261025910456,0.7640476226806641,"{""num_trials"": 8.0, ""prob_mean"": 0.18562499999999998, ""rate_mean"": 50.12398439148756, ""sig_count"": 0.0}" +decoding_spike_rate_cis,L,7,138.46276700000715,138.051728285717,1.750384804578685,2.7336788177490234,"{""num_trials"": 12.0, ""prob_mean"": 0.21328124999999998, ""rate_mean"": 50.073736692667104, ""sig_count"": 0.0}" diff --git a/tests/performance/fixtures/python/performance_baseline_linux_20260304.json b/tests/performance/fixtures/python/performance_baseline_linux_20260304.json new file mode 100644 index 00000000..14b9af45 --- /dev/null +++ 
b/tests/performance/fixtures/python/performance_baseline_linux_20260304.json @@ -0,0 +1,523 @@ +{ + "schema_version": 1, + "generated_at_utc": "2026-03-04T04:39:42Z", + "implementation": "python", + "repo_root": "/home/runner/work/nSTAT-python/nSTAT-python", + "git_sha": "bdd375cb5cfcfe637e71d22a1ff070e0ac42df80", + "tiers": [ + "S", + "M", + "L" + ], + "cases": [ + { + "case": "unit_impulse_basis", + "tier": "S", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 1.7549330000008467, + "mean_runtime_ms": 1.767814000002334, + "std_runtime_ms": 0.03712841168356124, + "median_peak_memory_mb": 0.39142704010009766, + "summary": { + "rows": 501.0, + "cols": 50.0, + "total_mass": 500.0 + }, + "samples_runtime_ms": [ + 1.8555799999973033, + 1.7672160000046233, + 1.7518669999958547, + 1.7363890000012816, + 1.7449640000108957, + 1.7549330000008467, + 1.7637490000055323 + ], + "samples_peak_memory_mb": [ + 0.39147281646728516, + 0.39145755767822266, + 0.39144229888916016, + 0.39142704010009766, + 0.39141178131103516, + 0.39138126373291016, + 0.39136600494384766 + ] + }, + { + "case": "unit_impulse_basis", + "tier": "M", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 5.230034999996747, + "mean_runtime_ms": 5.244222714285992, + "std_runtime_ms": 0.04359156024491504, + "median_peak_memory_mb": 3.076793670654297, + "summary": { + "rows": 2001.0, + "cols": 100.0, + "total_mass": 2000.0 + }, + "samples_runtime_ms": [ + 5.208524999986253, + 5.255633000004423, + 5.230034999996747, + 5.205519000000436, + 5.243821000007642, + 5.342964000007555, + 5.22306199999889 + ], + "samples_peak_memory_mb": [ + 3.0769615173339844, + 3.076946258544922, + 3.0769309997558594, + 3.076793670654297, + 3.0767784118652344, + 3.076770782470703, + 3.076763153076172 + ] + }, + { + "case": "unit_impulse_basis", + "tier": "L", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 10.598025999996707, + "mean_runtime_ms": 10.61451614285959, + "std_runtime_ms": 0.03639945260812847, + 
"median_peak_memory_mb": 18.373775482177734, + "summary": { + "rows": 6001.0, + "cols": 200.0, + "total_mass": 6000.0 + }, + "samples_runtime_ms": [ + 10.596453000005113, + 10.691879000006566, + 10.640424000001758, + 10.591734000001907, + 10.576616000008698, + 10.606480999996393, + 10.598025999996707 + ], + "samples_peak_memory_mb": [ + 18.37380599975586, + 18.373798370361328, + 18.373790740966797, + 18.373775482177734, + 18.373767852783203, + 18.373760223388672, + 18.37375259399414 + ] + }, + { + "case": "covariate_resample", + "tier": "S", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 0.3187809999900537, + "mean_runtime_ms": 0.31151314285742565, + "std_runtime_ms": 0.027315777992366848, + "median_peak_memory_mb": 0.06198883056640625, + "summary": { + "rows": 1001.0, + "cols": 1.0, + "signal_energy": 0.5195204795204795 + }, + "samples_runtime_ms": [ + 0.3284590000021126, + 0.3187809999900537, + 0.35717200000817684, + 0.2866319999981215, + 0.27595100000610273, + 0.32765699999970366, + 0.2859399999977086 + ], + "samples_peak_memory_mb": [ + 0.06198883056640625, + 0.06198883056640625, + 0.06198883056640625, + 0.06198883056640625, + 0.06198883056640625, + 0.06198883056640625, + 0.06198883056640625 + ] + }, + { + "case": "covariate_resample", + "tier": "M", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 0.3733120000077861, + "mean_runtime_ms": 0.3833688571432958, + "std_runtime_ms": 0.027679721024068783, + "median_peak_memory_mb": 0.1355915069580078, + "summary": { + "rows": 3001.0, + "cols": 1.0, + "signal_energy": 0.5198042747802833 + }, + "samples_runtime_ms": [ + 0.4426099999932376, + 0.3733120000077861, + 0.38337099999807833, + 0.3641439999881868, + 0.3593360000024859, + 0.40025100000207203, + 0.36055800001122407 + ], + "samples_peak_memory_mb": [ + 0.1355915069580078, + 0.1355915069580078, + 0.1355915069580078, + 0.1355915069580078, + 0.1355915069580078, + 0.1355915069580078, + 0.1355915069580078 + ] + }, + { + "case": "covariate_resample", + 
"tier": "L", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 0.4388640000030364, + "mean_runtime_ms": 0.43310742857321266, + "std_runtime_ms": 0.016053381349413916, + "median_peak_memory_mb": 0.2376575469970703, + "summary": { + "rows": 6001.0, + "cols": 1.0, + "signal_energy": 0.5199200133311115 + }, + "samples_runtime_ms": [ + 0.4090890000014724, + 0.43946499999947264, + 0.41093200000830166, + 0.4388640000030364, + 0.44900299999994786, + 0.43115900000145757, + 0.45323999999880016 + ], + "samples_peak_memory_mb": [ + 0.23763465881347656, + 0.23763465881347656, + 0.2376575469970703, + 0.2376575469970703, + 0.2376575469970703, + 0.2376575469970703, + 0.2376575469970703 + ] + }, + { + "case": "history_design_matrix", + "tier": "S", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 0.34009099999821046, + "mean_runtime_ms": 0.3473752857193598, + "std_runtime_ms": 0.02510644529088935, + "median_peak_memory_mb": 0.0890655517578125, + "summary": { + "rows": 1000.0, + "cols": 4.0, + "total_count": 9737.0 + }, + "samples_runtime_ms": [ + 0.39858800001013606, + 0.35199200000590736, + 0.3244020000039427, + 0.36373400000400125, + 0.33092400001066835, + 0.3218960000026527, + 0.34009099999821046 + ], + "samples_peak_memory_mb": [ + 0.08905029296875, + 0.0890655517578125, + 0.089080810546875, + 0.08907318115234375, + 0.0890655517578125, + 0.08905029296875, + 0.08904266357421875 + ] + }, + { + "case": "history_design_matrix", + "tier": "M", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 0.7935309999993478, + "mean_runtime_ms": 0.8029685714266829, + "std_runtime_ms": 0.02089668436143026, + "median_peak_memory_mb": 0.43688201904296875, + "summary": { + "rows": 5000.0, + "cols": 4.0, + "total_count": 243740.0 + }, + "samples_runtime_ms": [ + 0.8134179999927937, + 0.8033690000104343, + 0.7935309999993478, + 0.7772609999960878, + 0.7933710000003202, + 0.7922479999962206, + 0.8475819999915757 + ], + "samples_peak_memory_mb": [ + 0.4368743896484375, + 0.4368896484375, + 
0.4369049072265625, + 0.43689727783203125, + 0.43688201904296875, + 0.4368743896484375, + 0.43686676025390625 + ] + }, + { + "case": "history_design_matrix", + "tier": "L", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 1.6487759999961327, + "mean_runtime_ms": 1.6487702857140059, + "std_runtime_ms": 0.03152991976246771, + "median_peak_memory_mb": 0.8869400024414062, + "summary": { + "rows": 10000.0, + "cols": 4.0, + "total_count": 1462420.0 + }, + "samples_runtime_ms": [ + 1.6667189999992615, + 1.6867770000033033, + 1.6265349999997625, + 1.6487759999961327, + 1.6904029999977865, + 1.6031809999930147, + 1.6190010000087796 + ], + "samples_peak_memory_mb": [ + 0.886932373046875, + 0.8869476318359375, + 0.886962890625, + 0.8869476318359375, + 0.8869400024414062, + 0.886932373046875, + 0.8869247436523438 + ] + }, + { + "case": "simulate_cif_thinning", + "tier": "S", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 11.90902599999788, + "mean_runtime_ms": 11.841307428570401, + "std_runtime_ms": 0.2313883554103292, + "median_peak_memory_mb": 0.07111740112304688, + "summary": { + "num_units": 5.0, + "total_spikes": 69.0, + "mean_spikes_per_unit": 13.8 + }, + "samples_runtime_ms": [ + 11.399260999993999, + 11.735373999997023, + 12.0300100000037, + 11.90902599999788, + 11.70961600000453, + 12.155522999989898, + 11.950342000005776 + ], + "samples_peak_memory_mb": [ + 0.07123947143554688, + 0.07107925415039062, + 0.07111740112304688, + 0.07104873657226562, + 0.07101821899414062, + 0.07143020629882812, + 0.07115554809570312 + ] + }, + { + "case": "simulate_cif_thinning", + "tier": "M", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 46.7842029999872, + "mean_runtime_ms": 46.623392285716086, + "std_runtime_ms": 0.34392725136599767, + "median_peak_memory_mb": 0.13444900512695312, + "summary": { + "num_units": 10.0, + "total_spikes": 235.0, + "mean_spikes_per_unit": 23.5 + }, + "samples_runtime_ms": [ + 46.661997000001065, + 46.807135999998195, + 46.9655890000098, 
+ 46.7842029999872, + 46.52009400000168, + 45.840324000010924, + 46.78440300000375 + ], + "samples_peak_memory_mb": [ + 0.13444900512695312, + 0.13436508178710938, + 0.13444900512695312, + 0.13457107543945312, + 0.13434982299804688, + 0.13453292846679688, + 0.13437271118164062 + ] + }, + { + "case": "simulate_cif_thinning", + "tier": "L", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 138.86579399999732, + "mean_runtime_ms": 138.25536671428398, + "std_runtime_ms": 2.017449271316567, + "median_peak_memory_mb": 0.20075607299804688, + "summary": { + "num_units": 20.0, + "total_spikes": 733.0, + "mean_spikes_per_unit": 36.65 + }, + "samples_runtime_ms": [ + 140.78421000000674, + 138.86579399999732, + 139.35017199999322, + 136.84719200000472, + 134.61053599999673, + 137.09617399999274, + 140.23348899999633 + ], + "samples_peak_memory_mb": [ + 0.20079421997070312, + 0.20088577270507812, + 0.20074844360351562, + 0.20075607299804688, + 0.20060348510742188, + 0.20046615600585938, + 0.20093917846679688 + ] + }, + { + "case": "decoding_spike_rate_cis", + "tier": "S", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 29.966428000008705, + "mean_runtime_ms": 29.846530285713666, + "std_runtime_ms": 0.3328735707401364, + "median_peak_memory_mb": 0.2328357696533203, + "summary": { + "num_trials": 6.0, + "prob_mean": 0.1509259259259259, + "sig_count": 0.0, + "rate_mean": 50.4457886636761 + }, + "samples_runtime_ms": [ + 29.377155999995352, + 30.0538899999907, + 29.957221000003642, + 30.260263000002396, + 29.307927999994376, + 30.002826000000482, + 29.966428000008705 + ], + "samples_peak_memory_mb": [ + 0.2328357696533203, + 0.2328357696533203, + 0.2328357696533203, + 0.2328357696533203, + 0.2328357696533203, + 0.2328357696533203, + 0.2328357696533203 + ] + }, + { + "case": "decoding_spike_rate_cis", + "tier": "M", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 62.59277799999552, + "mean_runtime_ms": 63.25780771428567, + "std_runtime_ms": 1.3782261025910456, + 
"median_peak_memory_mb": 0.7640476226806641, + "summary": { + "num_trials": 8.0, + "prob_mean": 0.18562499999999998, + "sig_count": 0.0, + "rate_mean": 50.12398439148756 + }, + "samples_runtime_ms": [ + 62.16685800001187, + 62.88427799999852, + 62.58198800000514, + 62.59277799999552, + 62.56363399999998, + 66.50133299999084, + 63.513784999997824 + ], + "samples_peak_memory_mb": [ + 0.7639484405517578, + 0.7639980316162109, + 0.7639980316162109, + 0.7640476226806641, + 0.7641468048095703, + 0.7642955780029297, + 0.7643184661865234 + ] + }, + { + "case": "decoding_spike_rate_cis", + "tier": "L", + "repeats": 7, + "warmup": 2, + "median_runtime_ms": 138.46276700000715, + "mean_runtime_ms": 138.051728285717, + "std_runtime_ms": 1.750384804578685, + "median_peak_memory_mb": 2.7336788177490234, + "summary": { + "num_trials": 12.0, + "prob_mean": 0.21328124999999998, + "sig_count": 0.0, + "rate_mean": 50.073736692667104 + }, + "samples_runtime_ms": [ + 138.46276700000715, + 138.7568119999969, + 137.5696620000042, + 139.4315430000006, + 139.74387199999683, + 138.30987400000083, + 134.08756800001242 + ], + "samples_peak_memory_mb": [ + 2.7335567474365234, + 2.7336788177490234, + 2.7338504791259766, + 2.733602523803711, + 2.733701705932617, + 2.7337512969970703, + 2.733602523803711 + ] + } + ], + "environment": { + "python": "3.11.14", + "platform": "Linux-6.14.0-1017-azure-x86_64-with-glibc2.39", + "numpy": "2.4.2", + "scipy": "1.17.1", + "matplotlib": "3.10.8", + "omp_num_threads": "1", + "mkl_num_threads": "1", + "openblas_num_threads": "1", + "veclib_maximum_threads": "1" + } +} \ No newline at end of file diff --git a/tests/test_performance_reports.py b/tests/test_performance_reports.py index a79d52b9..86bdf726 100644 --- a/tests/test_performance_reports.py +++ b/tests/test_performance_reports.py @@ -9,9 +9,12 @@ def _load(path: Path) -> dict: return json.loads(path.read_text(encoding="utf-8")) +LINUX_BASELINE = 
Path("tests/performance/fixtures/python/performance_baseline_linux_20260304.json") + + def test_performance_fixture_coverage() -> None: matlab = _load(Path("tests/performance/fixtures/matlab/performance_baseline_470fde8.json")) - python = _load(Path("tests/performance/fixtures/python/performance_baseline_20260303.json")) + python = _load(LINUX_BASELINE) matlab_pairs = {(row["case"], row["tier"]) for row in matlab["cases"]} python_pairs = {(row["case"], row["tier"]) for row in python["cases"]} @@ -26,18 +29,19 @@ def test_performance_comparator_runs(tmp_path: Path) -> None: "python", "tools/performance/compare_matlab_python_performance.py", "--python-report", - "tests/performance/fixtures/python/performance_baseline_20260303.json", + str(LINUX_BASELINE), "--matlab-report", "tests/performance/fixtures/matlab/performance_baseline_470fde8.json", "--policy", "parity/performance_gate_policy.yml", "--previous-python-report", - "tests/performance/fixtures/python/performance_baseline_20260303.json", + str(LINUX_BASELINE), "--report-out", str(out_json), "--csv-out", str(out_csv), "--fail-on-regression", + "--require-regression-env-match", ] subprocess.run(cmd, check=True) @@ -48,8 +52,8 @@ def test_performance_comparator_runs(tmp_path: Path) -> None: def test_performance_comparator_skips_regression_on_env_mismatch(tmp_path: Path) -> None: - python_report = _load(Path("tests/performance/fixtures/python/performance_baseline_20260303.json")) - previous_report = _load(Path("tests/performance/fixtures/python/performance_baseline_20260303.json")) + python_report = _load(LINUX_BASELINE) + previous_report = _load(LINUX_BASELINE) # Force a would-be regression while also making previous env non-comparable. 
python_report["cases"][0]["median_runtime_ms"] = float(python_report["cases"][0]["median_runtime_ms"]) * 5.0 @@ -85,3 +89,39 @@ def test_performance_comparator_skips_regression_on_env_mismatch(tmp_path: Path) report = _load(out_json) assert report["policy"]["regression_env_compatible"] is False assert report["counts"]["regression_failures"] == 0 + + +def test_performance_comparator_can_require_env_match(tmp_path: Path) -> None: + python_report = _load(LINUX_BASELINE) + previous_report = _load(LINUX_BASELINE) + + previous_report["environment"]["platform"] = "Linux-test-x86_64" + previous_report["environment"]["python"] = "3.11.9" + + python_path = tmp_path / "python_report.json" + previous_path = tmp_path / "previous_report.json" + python_path.write_text(json.dumps(python_report), encoding="utf-8") + previous_path.write_text(json.dumps(previous_report), encoding="utf-8") + + out_json = tmp_path / "perf_report_env_required.json" + out_csv = tmp_path / "perf_report_env_required.csv" + cmd = [ + "python", + "tools/performance/compare_matlab_python_performance.py", + "--python-report", + str(python_path), + "--matlab-report", + "tests/performance/fixtures/matlab/performance_baseline_470fde8.json", + "--policy", + "parity/performance_gate_policy.yml", + "--previous-python-report", + str(previous_path), + "--report-out", + str(out_json), + "--csv-out", + str(out_csv), + "--fail-on-regression", + "--require-regression-env-match", + ] + proc = subprocess.run(cmd, check=False) + assert proc.returncode != 0 diff --git a/tools/performance/compare_matlab_python_performance.py b/tools/performance/compare_matlab_python_performance.py index 01c84e54..3eac0468 100755 --- a/tools/performance/compare_matlab_python_performance.py +++ b/tools/performance/compare_matlab_python_performance.py @@ -71,6 +71,14 @@ def main() -> int: action="store_true", help="Return non-zero when Python runtime regresses beyond threshold vs previous report.", ) + parser.add_argument( + 
"--require-regression-env-match", + action="store_true", + help=( + "When set, fail if previous Python baseline exists but benchmark " + "environment metadata is not comparable." + ), + ) parser.add_argument( "--fail-on-matlab-ratio", action="store_true", @@ -93,6 +101,12 @@ def main() -> int: if regression_env_compatible: prev_idx = _index_cases(prev.get("cases", [])) else: + if args.require_regression_env_match: + print( + "Regression environment mismatch and --require-regression-env-match was set: " + f"current={py_report.get('environment', {})}, previous={prev.get('environment', {})}" + ) + return 1 print( "Skipping regression gating: benchmark environments are not comparable " f"(current={py_report.get('environment', {})}, previous={prev.get('environment', {})})"