diff --git a/benchmarks/speed_review/README.md b/benchmarks/speed_review/README.md new file mode 100644 index 00000000..35440b2b --- /dev/null +++ b/benchmarks/speed_review/README.md @@ -0,0 +1,92 @@ +# Speed Review - Practitioner Workflow Benchmarks + +Scenario-driven performance measurement for end-to-end practitioner chains, +as distinct from `benchmarks/run_benchmarks.py` which measures R-parity on +isolated `fit()` calls. + +## Why these exist + +See [`docs/performance-scenarios.md`](../../docs/performance-scenarios.md) for +the full methodology. Short version: the existing benchmarks measure +`fit()` in isolation on 200 x 8 synthetic panels, which does not reflect what +a practitioner running the 8-step Baker et al. (2025) workflow on a real +BRFSS or geo-experiment panel actually sees. These scripts measure the full +chain (Bacon -> fit -> HonestDiD -> cross-estimator robustness -> reporting) +at data shapes anchored to applied-econ conventions. + +## Layout + +``` +benchmarks/speed_review/ +├── README.md # this file +├── bench_shared.py # timing + pyinstrument + RSS harness +├── run_all.py # orchestrator (both backends) +├── bench_campaign_staggered.py # Scenario 1: CS + 8-step chain +├── bench_brand_awareness_survey.py # Scenario 2: DiD + SurveyDesign +├── bench_brfss_panel.py # Scenario 3: aggregate_survey -> CS +├── bench_geo_few_markets.py # Scenario 4: SDiD + jackknife +├── bench_reversible_dcdh.py # Scenario 5: dCDH L_max + TSL +├── bench_dose_response.py # Scenario 6: ContinuousDiD splines +├── mem_profile_brfss.py # tracemalloc allocator attribution +│ # for BRFSS-1M (standalone) +├── bench_callaway.py # pre-existing CS scaling sweep +├── baseline_results.json # pre-existing CS baseline +└── baselines/ # this effort's output + ├── <scenario>_<backend>.json # phase-level wall-clock + peak RSS + ├── mem_profile_brfss_large_<backend>.txt # tracemalloc top-N sites + └── profiles/ # flame HTMLs (gitignored) + └── <scenario>_<backend>.html # pyinstrument flame output +``` + +Each JSON baseline records
both timing (per-phase wall-clock) and memory +(start/peak/growth from a psutil background sampler at 10 ms). The +`mem_profile_brfss.py` script does a separate tracemalloc pass on the +BRFSS-1M scenario - this is kept out of the main timing harness because +tracemalloc has 2-5x overhead and would contaminate wall-clock baselines. + +**Note on profile HTMLs.** pyinstrument flames are ~500KB-1.2MB each and are +regenerated on every run; they live under `baselines/profiles/` which is +gitignored. The key hotspots identified from them are already captured in +the findings doc (top-5 hot phases per scenario); run a scenario locally +to regenerate the full flame when needed. + +## Running + +```bash +# One-time install +pip install pyinstrument + +# All scenarios, both backends, all scales +python benchmarks/speed_review/run_all.py + +# One scenario, one backend (the script runs its full scale sweep internally) +DIFF_DIFF_BACKEND=rust python benchmarks/speed_review/bench_campaign_staggered.py + +# Subset +python benchmarks/speed_review/run_all.py --scenarios brfss_panel geo_few_markets +``` + +Multi-scale scenarios write per-scale outputs +(e.g. `campaign_staggered_small_rust.json`, `..._medium_rust.json`, +`..._large_rust.json`). Single-scale scenarios write the scale-free form +(e.g. `dose_response_rust.json`). Full runtime for all scales × both +backends is ~90 seconds on Apple Silicon M4. + +## Where to look for findings + +[`docs/performance-plan.md`](../../docs/performance-plan.md) - "Practitioner +Workflow Baseline (v3.1.3)" section holds per-scenario hot-phase rankings +and action recommendations. The scenarios here are the measurement surface; +the findings doc is the decision output. + +## Adding a scenario + +1. Add the scenario definition to `docs/performance-scenarios.md` + (persona, data shape, operation chain, source anchor). +2. 
Add `bench_<scenario>.py` following the existing scripts: build data, define + `phases` as a list of `(label, callable)` tuples, call `run_scenario`. +3. Register it in `run_all.py`'s `SCRIPTS` dict. +4. Run under both backends and commit the refreshed `baselines/*.json`. + The `baselines/profiles/*.html` flame HTMLs are gitignored and + regenerated per run - do not commit them. +5. Add a per-scenario finding paragraph to `docs/performance-plan.md`. diff --git a/benchmarks/speed_review/baselines/.gitignore b/benchmarks/speed_review/baselines/.gitignore new file mode 100644 index 00000000..66d050c3 --- /dev/null +++ b/benchmarks/speed_review/baselines/.gitignore @@ -0,0 +1 @@ +profiles/ diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json b/benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json new file mode 100644 index 00000000..c8eb9108 --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json @@ -0,0 +1,66 @@ +{ + "scenario": "brand_awareness_survey_large", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 1.0910496250000001, + "memory": { + "available": true, + "start_mb": 188.45, + "peak_mb": 327.44, + "growth_mb": 138.98, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 0.009826500000000182, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.030280333999999964, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.6243122919999999, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.24174716599999968, + "ok": true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.025623749999999834, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.01191299999999984, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.147335875, + "ok": true, + "error": 
null + } + }, + "metadata": { + "scale": "large", + "n_units": 1000, + "n_periods": 12, + "n_obs": 12000, + "n_strata": 20, + "n_psu_per_stratum": 8, + "n_replicate_weights": 160, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json b/benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json new file mode 100644 index 00000000..a3eb721c --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json @@ -0,0 +1,66 @@ +{ + "scenario": "brand_awareness_survey_large", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 1.0000031249999999, + "memory": { + "available": true, + "start_mb": 194.03, + "peak_mb": 336.08, + "growth_mb": 142.05, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 0.013511041000000112, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.03037650000000003, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.5431151669999998, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.21752962499999962, + "ok": true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.04399687500000038, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.016433082999999904, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.13501837500000002, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_units": 1000, + "n_periods": 12, + "n_obs": 12000, + "n_strata": 20, + "n_psu_per_stratum": 8, + "n_replicate_weights": 160, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end 
of file diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json b/benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json new file mode 100644 index 00000000..869c5393 --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json @@ -0,0 +1,66 @@ +{ + "scenario": "brand_awareness_survey_medium", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.563283334, + "memory": { + "available": true, + "start_mb": 133.69, + "peak_mb": 187.7, + "growth_mb": 54.02, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 0.010921792000000097, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.03732066599999995, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.20805304199999997, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.12622899999999992, + "ok": true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.01834783299999998, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.054030583000000076, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.10836029199999997, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 500, + "n_periods": 12, + "n_obs": 6000, + "n_strata": 15, + "n_psu_per_stratum": 6, + "n_replicate_weights": 90, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json b/benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json new file mode 100644 index 00000000..2ceed1ca --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json @@ -0,0 +1,66 @@ +{ + 
"scenario": "brand_awareness_survey_medium", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.5500554579999999, + "memory": { + "available": true, + "start_mb": 135.36, + "peak_mb": 184.86, + "growth_mb": 49.5, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 0.011186999999999947, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.03363270800000007, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.18678066699999996, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.16038787500000007, + "ok": true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.022171542000000155, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.0532650830000001, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.08262075000000002, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 500, + "n_periods": 12, + "n_obs": 6000, + "n_strata": 15, + "n_psu_per_stratum": 6, + "n_replicate_weights": 90, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json b/benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json new file mode 100644 index 00000000..699da724 --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json @@ -0,0 +1,66 @@ +{ + "scenario": "brand_awareness_survey_small", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.19338629200000002, + "memory": { + "available": true, + "start_mb": 115.48, + "peak_mb": 127.31, + "growth_mb": 11.83, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 
0.0014470410000000378, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.0072707499999999925, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.023173292000000068, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.03375529200000005, + "ok": true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.01041325000000004, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.027520249999999913, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.08979433299999995, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_units": 200, + "n_periods": 12, + "n_obs": 2400, + "n_strata": 10, + "n_psu_per_stratum": 4, + "n_replicate_weights": 40, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json b/benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json new file mode 100644 index 00000000..006bc684 --- /dev/null +++ b/benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json @@ -0,0 +1,66 @@ +{ + "scenario": "brand_awareness_survey_small", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.19669587500000008, + "memory": { + "available": true, + "start_mb": 114.78, + "peak_mb": 127.91, + "growth_mb": 13.12, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_naive_fit_no_survey_design": { + "seconds": 0.0016678749999999853, + "ok": true, + "error": null + }, + "2_tsl_strata_psu_fpc": { + "seconds": 0.005756874999999995, + "ok": true, + "error": null + }, + "3_replicate_weights_jk1": { + "seconds": 0.012066042000000055, + "ok": true, + "error": null + }, + "4_multi_outcome_loop_3_metrics": { + "seconds": 0.05887395800000006, + "ok": 
true, + "error": null + }, + "5_check_parallel_trends": { + "seconds": 0.008938375000000054, + "ok": true, + "error": null + }, + "6_placebo_refit_pre_period": { + "seconds": 0.0274049999999999, + "ok": true, + "error": null + }, + "7_event_study_plus_honest_did": { + "seconds": 0.08197737500000002, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_units": 200, + "n_periods": 12, + "n_obs": 2400, + "n_strata": 10, + "n_psu_per_stratum": 4, + "n_replicate_weights": 40, + "outcomes": [ + "outcome", + "consideration", + "purchase_intent" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_large_python.json b/benchmarks/speed_review/baselines/brfss_panel_large_python.json new file mode 100644 index 00000000..1772355b --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_large_python.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_large", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 24.406984582999996, + "memory": { + "available": true, + "start_mb": 401.05, + "peak_mb": 418.12, + "growth_mb": 17.08, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 24.295822291, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.012265292000002148, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.2919999977943917e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.0016812089999973523, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.09669395799999592, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.0005083750000025589, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_microdata_rows": 1000000, + "n_states": 50, + "n_years": 10, + "n_strata": 20, + "n_psu": 1000, + 
"n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_large_rust.json b/benchmarks/speed_review/baselines/brfss_panel_large_rust.json new file mode 100644 index 00000000..886c63cc --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_large_rust.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_large", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 24.936181916, + "memory": { + "available": true, + "start_mb": 396.06, + "peak_mb": 429.31, + "growth_mb": 33.25, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 24.820139083, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.012674374999996019, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.500000000793534e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.0015977500000019518, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.10144270800000044, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.00030387500000017553, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_microdata_rows": 1000000, + "n_states": 50, + "n_years": 10, + "n_strata": 20, + "n_psu": 1000, + "n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_medium_python.json b/benchmarks/speed_review/baselines/brfss_panel_medium_python.json new file mode 100644 index 00000000..91e5e648 --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_medium_python.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_medium", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 6.096216417, + "memory": { + "available": true, + 
"start_mb": 193.25, + "peak_mb": 209.78, + "growth_mb": 16.53, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 5.9895347910000005, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.012643416999999602, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.166999999886343e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.0015969160000004479, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.0921533340000007, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.0002710829999994502, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_microdata_rows": 250000, + "n_states": 50, + "n_years": 10, + "n_strata": 15, + "n_psu": 600, + "n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_medium_rust.json b/benchmarks/speed_review/baselines/brfss_panel_medium_rust.json new file mode 100644 index 00000000..670b3135 --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_medium_rust.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_medium", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 6.228102207999999, + "memory": { + "available": true, + "start_mb": 197.56, + "peak_mb": 212.22, + "growth_mb": 14.66, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 6.142273, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.012037416000000078, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.1249999999639613e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.0016153329999983868, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + 
"seconds": 0.07184195800000026, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.0003229160000000064, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_microdata_rows": 250000, + "n_states": 50, + "n_years": 10, + "n_strata": 15, + "n_psu": 600, + "n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_small_python.json b/benchmarks/speed_review/baselines/brfss_panel_small_python.json new file mode 100644 index 00000000..093a7daf --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_small_python.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_small", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 1.608562042, + "memory": { + "available": true, + "start_mb": 121.97, + "peak_mb": 133.39, + "growth_mb": 11.42, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 1.523675458, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.015124000000000137, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.165999999803603e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.004194041999999953, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.0653021250000001, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.00026012500000005545, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_microdata_rows": 50000, + "n_states": 50, + "n_years": 10, + "n_strata": 10, + "n_psu": 200, + "n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/brfss_panel_small_rust.json b/benchmarks/speed_review/baselines/brfss_panel_small_rust.json new 
file mode 100644 index 00000000..a1f19a21 --- /dev/null +++ b/benchmarks/speed_review/baselines/brfss_panel_small_rust.json @@ -0,0 +1,56 @@ +{ + "scenario": "brfss_panel_small", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 1.6610665, + "memory": { + "available": true, + "start_mb": 121.16, + "peak_mb": 136.44, + "growth_mb": 15.28, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_aggregate_survey_microdata_to_panel": { + "seconds": 1.5438897920000003, + "ok": true, + "error": null + }, + "2_cs_fit_with_stage2_survey_design": { + "seconds": 0.01586162499999988, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.4999999999053557e-06, + "ok": true, + "error": null + }, + "4_honest_did_grid": { + "seconds": 0.003953542000000088, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.09701791599999998, + "ok": true, + "error": null + }, + "6_practitioner_next_steps": { + "seconds": 0.00032904199999972406, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_microdata_rows": 50000, + "n_states": 50, + "n_years": 10, + "n_strata": 10, + "n_psu": 200, + "n_bootstrap": 199 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_large_python.json b/benchmarks/speed_review/baselines/campaign_staggered_large_python.json new file mode 100644 index 00000000..0c2dc359 --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_large_python.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_large", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 1.3326843750000001, + "memory": { + "available": true, + "start_mb": 227.28, + "peak_mb": 472.22, + "growth_mb": 244.94, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.019139459000000025, + "ok": true, + "error": null + }, + 
"2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.16680450000000002, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 3.042000000341716e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 0.002607332999999823, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.3669262500000001, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.649511, + "ok": true, + "error": null + }, + "7_cs_without_covariates": { + "seconds": 0.12763954200000027, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 4.033299999983697e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_units": 1500, + "n_periods": 26, + "n_cohorts": 3, + "n_obs": 39000, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_large_rust.json b/benchmarks/speed_review/baselines/campaign_staggered_large_rust.json new file mode 100644 index 00000000..6766f7ac --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_large_rust.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_large", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 1.3826507919999997, + "memory": { + "available": true, + "start_mb": 265.8, + "peak_mb": 587.92, + "growth_mb": 322.12, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.019430332999999855, + "ok": true, + "error": null + }, + "2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.17791104199999985, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 3.5419999999675156e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 
0.0025778330000001404, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.5076542499999999, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.5523530000000001, + "ok": true, + "error": null + }, + "7_cs_without_covariates": { + "seconds": 0.12266958400000005, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 4.233299999967244e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_units": 1500, + "n_periods": 26, + "n_cohorts": 3, + "n_obs": 39000, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_medium_python.json b/benchmarks/speed_review/baselines/campaign_staggered_medium_python.json new file mode 100644 index 00000000..914a09aa --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_medium_python.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_medium", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.7537883749999998, + "memory": { + "available": true, + "start_mb": 147.67, + "peak_mb": 226.62, + "growth_mb": 78.95, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.012091666999999973, + "ok": true, + "error": null + }, + "2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.09575774999999997, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.9589999999135586e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 0.002356958999999881, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.276134208, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.2946765, + "ok": true, + "error": null + }, + 
"7_cs_without_covariates": { + "seconds": 0.07270195899999998, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 5.983399999998085e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 500, + "n_periods": 26, + "n_cohorts": 3, + "n_obs": 13000, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_medium_rust.json b/benchmarks/speed_review/baselines/campaign_staggered_medium_rust.json new file mode 100644 index 00000000..81c02255 --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_medium_rust.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_medium", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.756008333, + "memory": { + "available": true, + "start_mb": 154.94, + "peak_mb": 254.11, + "growth_mb": 99.17, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.012925999999999993, + "ok": true, + "error": null + }, + "2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.09863954099999983, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 3.1659999999433808e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 0.0024457499999999133, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.281516125, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.29128733399999995, + "ok": true, + "error": null + }, + "7_cs_without_covariates": { + "seconds": 0.06915141700000005, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 3.383300000003864e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 500, + 
"n_periods": 26, + "n_cohorts": 3, + "n_obs": 13000, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_small_python.json b/benchmarks/speed_review/baselines/campaign_staggered_small_python.json new file mode 100644 index 00000000..44e82483 --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_small_python.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_small", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.509287875, + "memory": { + "available": true, + "start_mb": 114.72, + "peak_mb": 143.08, + "growth_mb": 28.36, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.008488708000000011, + "ok": true, + "error": null + }, + "2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.06242541699999993, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 3.3329999999942572e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 0.00873587500000006, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.18465104099999996, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.20897954100000016, + "ok": true, + "error": null + }, + "7_cs_without_covariates": { + "seconds": 0.03596216600000002, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 3.28339999999816e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_units": 150, + "n_periods": 26, + "n_cohorts": 2, + "n_obs": 3900, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline 
at end of file diff --git a/benchmarks/speed_review/baselines/campaign_staggered_small_rust.json b/benchmarks/speed_review/baselines/campaign_staggered_small_rust.json new file mode 100644 index 00000000..bfe53aed --- /dev/null +++ b/benchmarks/speed_review/baselines/campaign_staggered_small_rust.json @@ -0,0 +1,71 @@ +{ + "scenario": "campaign_staggered_small", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.501876834, + "memory": { + "available": true, + "start_mb": 114.78, + "peak_mb": 150.67, + "growth_mb": 35.89, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_bacon_decomposition": { + "seconds": 0.0068224170000000806, + "ok": true, + "error": null + }, + "2_cs_fit_with_covariates_bootstrap999": { + "seconds": 0.06276566699999997, + "ok": true, + "error": null + }, + "3_inspect_pretrends": { + "seconds": 2.9160000000194586e-06, + "ok": true, + "error": null + }, + "4_honest_did_M_grid": { + "seconds": 0.004543957999999959, + "ok": true, + "error": null + }, + "5_sun_abraham_robustness": { + "seconds": 0.14964783299999995, + "ok": true, + "error": null + }, + "6_imputation_did_robustness": { + "seconds": 0.241357292, + "ok": true, + "error": null + }, + "7_cs_without_covariates": { + "seconds": 0.03669304200000001, + "ok": true, + "error": null + }, + "8_practitioner_next_steps": { + "seconds": 3.850000000005238e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_units": 150, + "n_periods": 26, + "n_cohorts": 2, + "n_obs": 3900, + "covariates": [ + "log_pop", + "baseline_spend" + ], + "n_bootstrap": 999, + "aggregate": "all", + "estimation_method": "dr" + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/dose_response_python.json b/benchmarks/speed_review/baselines/dose_response_python.json new file mode 100644 index 00000000..0e576e88 --- /dev/null +++ b/benchmarks/speed_review/baselines/dose_response_python.json 
@@ -0,0 +1,57 @@ +{ + "scenario": "dose_response", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.5912168340000001, + "memory": { + "available": true, + "start_mb": 114.11, + "peak_mb": 123.11, + "growth_mb": 9.0, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_cdid_cubic_spline_bootstrap199": { + "seconds": 0.15039274999999996, + "ok": true, + "error": null + }, + "2_extract_dose_response_dataframes": { + "seconds": 0.0007435829999999921, + "ok": true, + "error": null + }, + "3_cdid_event_study_pretrend": { + "seconds": 0.14597749999999998, + "ok": true, + "error": null + }, + "4_binarized_did_comparison": { + "seconds": 0.0017279590000000011, + "ok": true, + "error": null + }, + "5_spline_sensitivity_degree1": { + "seconds": 0.14600595799999994, + "ok": true, + "error": null + }, + "6_spline_sensitivity_num_knots2": { + "seconds": 0.14636520799999997, + "ok": true, + "error": null + } + }, + "metadata": { + "n_units": 500, + "n_periods": 6, + "n_bootstrap": 199, + "spline_configs": [ + "degree=3,k=1", + "degree=1,k=0", + "degree=3,k=2" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/dose_response_rust.json b/benchmarks/speed_review/baselines/dose_response_rust.json new file mode 100644 index 00000000..51039f15 --- /dev/null +++ b/benchmarks/speed_review/baselines/dose_response_rust.json @@ -0,0 +1,57 @@ +{ + "scenario": "dose_response", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.5952834579999999, + "memory": { + "available": true, + "start_mb": 113.73, + "peak_mb": 121.34, + "growth_mb": 7.61, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_cdid_cubic_spline_bootstrap199": { + "seconds": 0.15132816700000007, + "ok": true, + "error": null + }, + "2_extract_dose_response_dataframes": { + "seconds": 0.0007386659999999434, + "ok": true, + "error": null + }, + "3_cdid_event_study_pretrend": { + "seconds": 
0.147476167, + "ok": true, + "error": null + }, + "4_binarized_did_comparison": { + "seconds": 0.001677958000000035, + "ok": true, + "error": null + }, + "5_spline_sensitivity_degree1": { + "seconds": 0.145152917, + "ok": true, + "error": null + }, + "6_spline_sensitivity_num_knots2": { + "seconds": 0.14890500000000007, + "ok": true, + "error": null + } + }, + "metadata": { + "n_units": 500, + "n_periods": 6, + "n_bootstrap": 199, + "spline_configs": [ + "degree=3,k=1", + "degree=1,k=0", + "degree=3,k=2" + ] + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/geo_few_markets_large_rust.json b/benchmarks/speed_review/baselines/geo_few_markets_large_rust.json new file mode 100644 index 00000000..dce42749 --- /dev/null +++ b/benchmarks/speed_review/baselines/geo_few_markets_large_rust.json @@ -0,0 +1,55 @@ +{ + "scenario": "geo_few_markets_large", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.26079429200000015, + "memory": { + "available": true, + "start_mb": 117.8, + "peak_mb": 118.22, + "growth_mb": 0.42, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_sdid_jackknife_variance": { + "seconds": 0.04102845799999999, + "ok": true, + "error": null + }, + "2_sdid_bootstrap_variance_200": { + "seconds": 0.03718729200000004, + "ok": true, + "error": null + }, + "3_in_time_placebo": { + "seconds": 0.07744412499999997, + "ok": true, + "error": null + }, + "4_get_loo_effects_df": { + "seconds": 0.0008073330000000212, + "ok": true, + "error": null + }, + "5_sensitivity_to_zeta_omega": { + "seconds": 0.10429091600000007, + "ok": true, + "error": null + }, + "6_weight_concentration": { + "seconds": 3.220799999992252e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "large", + "n_units": 500, + "n_pre": 6, + "n_post": 6, + "n_treated": 30, + "n_factors": 2 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end 
of file diff --git a/benchmarks/speed_review/baselines/geo_few_markets_medium_python.json b/benchmarks/speed_review/baselines/geo_few_markets_medium_python.json new file mode 100644 index 00000000..868c0578 --- /dev/null +++ b/benchmarks/speed_review/baselines/geo_few_markets_medium_python.json @@ -0,0 +1,55 @@ +{ + "scenario": "geo_few_markets_medium", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 3.9883142080000002, + "memory": { + "available": true, + "start_mb": 143.86, + "peak_mb": 151.53, + "growth_mb": 7.67, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_sdid_jackknife_variance": { + "seconds": 0.35804470799999955, + "ok": true, + "error": null + }, + "2_sdid_bootstrap_variance_200": { + "seconds": 0.36447529099999976, + "ok": true, + "error": null + }, + "3_in_time_placebo": { + "seconds": 1.5563965419999999, + "ok": true, + "error": null + }, + "4_get_loo_effects_df": { + "seconds": 0.0007229159999999624, + "ok": true, + "error": null + }, + "5_sensitivity_to_zeta_omega": { + "seconds": 1.7086395420000002, + "ok": true, + "error": null + }, + "6_weight_concentration": { + "seconds": 2.9666999999733434e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 200, + "n_pre": 6, + "n_post": 6, + "n_treated": 15, + "n_factors": 2 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/geo_few_markets_medium_rust.json b/benchmarks/speed_review/baselines/geo_few_markets_medium_rust.json new file mode 100644 index 00000000..bd4471a6 --- /dev/null +++ b/benchmarks/speed_review/baselines/geo_few_markets_medium_rust.json @@ -0,0 +1,55 @@ +{ + "scenario": "geo_few_markets_medium", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.118741875, + "memory": { + "available": true, + "start_mb": 117.23, + "peak_mb": 117.64, + "growth_mb": 0.41, + "sampler_interval_s": 0.01 + }, + "phases": { + 
"1_sdid_jackknife_variance": { + "seconds": 0.020535375000000022, + "ok": true, + "error": null + }, + "2_sdid_bootstrap_variance_200": { + "seconds": 0.023519291000000053, + "ok": true, + "error": null + }, + "3_in_time_placebo": { + "seconds": 0.02495891699999997, + "ok": true, + "error": null + }, + "4_get_loo_effects_df": { + "seconds": 0.0006400839999999297, + "ok": true, + "error": null + }, + "5_sensitivity_to_zeta_omega": { + "seconds": 0.049061250000000056, + "ok": true, + "error": null + }, + "6_weight_concentration": { + "seconds": 2.31669999999351e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "medium", + "n_units": 200, + "n_pre": 6, + "n_post": 6, + "n_treated": 15, + "n_factors": 2 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/geo_few_markets_small_python.json b/benchmarks/speed_review/baselines/geo_few_markets_small_python.json new file mode 100644 index 00000000..e0bec083 --- /dev/null +++ b/benchmarks/speed_review/baselines/geo_few_markets_small_python.json @@ -0,0 +1,55 @@ +{ + "scenario": "geo_few_markets_small", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 3.697791375, + "memory": { + "available": true, + "start_mb": 114.09, + "peak_mb": 124.02, + "growth_mb": 9.92, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_sdid_jackknife_variance": { + "seconds": 0.593809709, + "ok": true, + "error": null + }, + "2_sdid_bootstrap_variance_200": { + "seconds": 0.584832209, + "ok": true, + "error": null + }, + "3_in_time_placebo": { + "seconds": 1.194314458, + "ok": true, + "error": null + }, + "4_get_loo_effects_df": { + "seconds": 0.0009036250000002966, + "ok": true, + "error": null + }, + "5_sensitivity_to_zeta_omega": { + "seconds": 1.3238487909999996, + "ok": true, + "error": null + }, + "6_weight_concentration": { + "seconds": 7.791699999959434e-05, + "ok": true, + "error": null + } + }, + "metadata": 
{ + "scale": "small", + "n_units": 80, + "n_pre": 6, + "n_post": 6, + "n_treated": 5, + "n_factors": 2 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/geo_few_markets_small_rust.json b/benchmarks/speed_review/baselines/geo_few_markets_small_rust.json new file mode 100644 index 00000000..855eac85 --- /dev/null +++ b/benchmarks/speed_review/baselines/geo_few_markets_small_rust.json @@ -0,0 +1,55 @@ +{ + "scenario": "geo_few_markets_small", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.04129770799999999, + "memory": { + "available": true, + "start_mb": 114.56, + "peak_mb": 116.05, + "growth_mb": 1.48, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_sdid_jackknife_variance": { + "seconds": 0.008074541000000046, + "ok": true, + "error": null + }, + "2_sdid_bootstrap_variance_200": { + "seconds": 0.012903124999999904, + "ok": true, + "error": null + }, + "3_in_time_placebo": { + "seconds": 0.008189833999999951, + "ok": true, + "error": null + }, + "4_get_loo_effects_df": { + "seconds": 0.0009220420000000118, + "ok": true, + "error": null + }, + "5_sensitivity_to_zeta_omega": { + "seconds": 0.01117779200000002, + "ok": true, + "error": null + }, + "6_weight_concentration": { + "seconds": 2.6250000000005436e-05, + "ok": true, + "error": null + } + }, + "metadata": { + "scale": "small", + "n_units": 80, + "n_pre": 6, + "n_post": 6, + "n_treated": 5, + "n_factors": 2 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/mem_profile_brfss_large_rust.txt b/benchmarks/speed_review/baselines/mem_profile_brfss_large_rust.txt new file mode 100644 index 00000000..1bc56a4e --- /dev/null +++ b/benchmarks/speed_review/baselines/mem_profile_brfss_large_rust.txt @@ -0,0 +1,30 @@ +# BRFSS-1M aggregate_survey allocation attribution +# backend: rust +# input microdata rows: 
1,000,000 +# input microdata memory: 45.8 MB +# output panel cells: 500 + +# tracemalloc totals during aggregate_survey +# total net size diff across all sites: 0.5 MB +# top single-site size diff: 0.15 MB +# python peak traced: 84.2 MB +# python current retained: 0.6 MB + +# top 15 allocation sites by size delta +# size diff (MB) count diff location +-------------------------------------------------------------------------------- +1 0.15 1521 /lib/python3.9/linecache.py:148 +2 0.04 7 /.venv/lib/python3.9/site-packages/pandas/core/internals/blocks.py:822 +3 0.02 440 /.venv/lib/python3.9/site-packages/pandas/core/sorting.py:637 +4 0.01 64 /lib/python3.9/abc.py:123 +5 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/frame.py:12710 +6 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/frame.py:698 +7 0.00 16 /.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py:5372 +8 0.00 51 /.venv/lib/python3.9/site-packages/pandas/core/groupby/ops.py:427 +9 0.00 55 /lib/python3.9/sre_parse.py:529 +10 0.00 43 /diff_diff/prep.py:1618 +11 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/internals/construction.py:237 +12 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/groupby/grouper.py:846 +13 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/construction.py:517 +14 0.00 2 /.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py:475 +15 0.00 35 /.venv/lib/python3.9/site-packages/pandas/core/internals/managers.py:1500 diff --git a/benchmarks/speed_review/baselines/reversible_dcdh_python.json b/benchmarks/speed_review/baselines/reversible_dcdh_python.json new file mode 100644 index 00000000..1cbed394 --- /dev/null +++ b/benchmarks/speed_review/baselines/reversible_dcdh_python.json @@ -0,0 +1,45 @@ +{ + "scenario": "reversible_dcdh", + "backend": "python", + "has_rust_backend": false, + "total_seconds": 0.718732833, + "memory": { + "available": true, + "start_mb": 113.5, + "peak_mb": 135.02, + "growth_mb": 21.52, + "sampler_interval_s": 0.01 + }, + 
"phases": { + "1_dcdh_fit_Lmax3_survey_TSL": { + "seconds": 0.3450735829999999, + "ok": true, + "error": null + }, + "2_inspect_placebo_and_summary": { + "seconds": 1.4160000000318362e-06, + "ok": true, + "error": null + }, + "3_honest_did_on_placebo": { + "seconds": 0.004985583999999932, + "ok": true, + "error": null + }, + "4_heterogeneity_refit": { + "seconds": 0.36866958299999986, + "ok": true, + "error": null + } + }, + "metadata": { + "n_groups": 120, + "n_periods": 10, + "pattern": "single_switch", + "L_max": 3, + "n_strata": 8, + "n_psu": 24 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/baselines/reversible_dcdh_rust.json b/benchmarks/speed_review/baselines/reversible_dcdh_rust.json new file mode 100644 index 00000000..2af530f5 --- /dev/null +++ b/benchmarks/speed_review/baselines/reversible_dcdh_rust.json @@ -0,0 +1,45 @@ +{ + "scenario": "reversible_dcdh", + "backend": "rust", + "has_rust_backend": true, + "total_seconds": 0.751090292, + "memory": { + "available": true, + "start_mb": 113.7, + "peak_mb": 134.89, + "growth_mb": 21.19, + "sampler_interval_s": 0.01 + }, + "phases": { + "1_dcdh_fit_Lmax3_survey_TSL": { + "seconds": 0.36838229199999994, + "ok": true, + "error": null + }, + "2_inspect_placebo_and_summary": { + "seconds": 1.3340000000194863e-06, + "ok": true, + "error": null + }, + "3_honest_did_on_placebo": { + "seconds": 0.005142916999999914, + "ok": true, + "error": null + }, + "4_heterogeneity_refit": { + "seconds": 0.3775615830000001, + "ok": true, + "error": null + } + }, + "metadata": { + "n_groups": 120, + "n_periods": 10, + "pattern": "single_switch", + "L_max": 3, + "n_strata": 8, + "n_psu": 24 + }, + "diff_diff_version": "3.1.3", + "numpy_version": "2.0.2" +} \ No newline at end of file diff --git a/benchmarks/speed_review/bench_brand_awareness_survey.py b/benchmarks/speed_review/bench_brand_awareness_survey.py new file mode 100644 index 
00000000..c41d66b9 --- /dev/null +++ b/benchmarks/speed_review/bench_brand_awareness_survey.py @@ -0,0 +1,183 @@ +""" +Scenario 2: Brand awareness survey DiD - 2x2 with survey design. + +DifferenceInDifferences + SurveyDesign under two variance paths: + (a) analytical Taylor-series linearization (strata + PSU + FPC) + (b) replicate-weight variance (JK1 delete-one-PSU; count equals + the number of PSUs, so 40/90/160 at small/medium/large). + This is replicate-weight variance, not bootstrap resampling - + see REGISTRY.md for the distinction. + +Chains: naive fit (for SE-inflation comparison) -> TSL -> replicate -> multi- +outcome refit loop -> check_parallel_trends -> placebo -> HonestDiD grid. + +Three scales: + - small (200 units x 12 periods): Tutorial 17 analog + - medium (500 units x 12 periods): realistic CPG quarterly brand-tracking wave + - large (1000 units x 12 periods): multi-region brand tracking at scale +""" + +import numpy as np + +from diff_diff import ( + DifferenceInDifferences, + MultiPeriodDiD, + SurveyDesign, + check_parallel_trends, + compute_honest_did, +) +from diff_diff.prep import generate_survey_did_data + +from bench_shared import run_scenario + + +SCALES = { + "small": {"n_units": 200, "n_periods": 12, "n_strata": 10, "psu_per_stratum": 4}, + "medium": {"n_units": 500, "n_periods": 12, "n_strata": 15, "psu_per_stratum": 6}, + "large": {"n_units": 1000, "n_periods": 12, "n_strata": 20, "psu_per_stratum": 8}, +} + + +def build_data(n_units, n_periods, n_strata, psu_per_stratum, seed=42): + df = generate_survey_did_data( + n_units=n_units, n_periods=n_periods, cohort_periods=[7], + never_treated_frac=0.5, treatment_effect=2.0, + dynamic_effects=True, effect_growth=0.2, + n_strata=n_strata, psu_per_stratum=psu_per_stratum, + weight_variation="high", psu_re_sd=1.5, + include_replicate_weights=True, panel=True, seed=seed, + ) + rng = np.random.default_rng(seed + 1) + df["consideration"] = df["outcome"] + rng.normal(0, 0.4, size=len(df)) + 
def make_phases(data, results, rw_cols):
    """Assemble the ordered phase list for the brand-awareness survey scenario.

    Each phase is a zero-argument callable that stores its output in
    ``results`` under a fixed key, so later phases and the caller can read
    what earlier phases produced.

    Args:
        data: Frame the phases fit on. The phases read the ``outcome``,
            ``consideration``, ``purchase_intent``, ``treat_unit``, ``post``,
            ``period`` and ``unit`` columns, plus the survey columns named
            in the designs below.
        results: Dict mutated in place by every phase.
        rw_cols: Replicate-weight column names. A falsy value makes the
            replicate phase raise ``RuntimeError``.

    Returns:
        A list of ``(phase_label, callable)`` tuples in execution order.
    """
    # One analytical TSL SurveyDesign is shared by every analytical survey
    # phase (TSL, multi-outcome, placebo, HonestDiD event-study) so that
    # strata/PSU/FPC/nest stay constant across them. The replicate-weight
    # phase builds its own JK1 design, which does not take FPC.
    analytical_design = SurveyDesign(
        weights="weight", strata="stratum", psu="psu",
        fpc="fpc", nest=True,
    )

    def _fit_2x2(frame, outcome, time, design=None):
        # ``survey_design`` is forwarded only when a design is supplied, so
        # the naive phase leaves the estimator's own default untouched.
        fit_args = {"outcome": outcome, "treatment": "treat_unit", "time": time}
        if design is not None:
            fit_args["survey_design"] = design
        return DifferenceInDifferences(robust=True).fit(frame, **fit_args)

    def naive_fit():
        # Deliberately naive comparison point: no survey design and no
        # clustering, i.e. the first pass of an analyst who has not yet
        # accounted for the sampling structure.
        results["naive"] = _fit_2x2(data, "outcome", "post")

    def tsl_fit():
        results["tsl"] = _fit_2x2(data, "outcome", "post", analytical_design)

    def replicate_fit():
        if not rw_cols:
            raise RuntimeError("replicate weights not generated")
        jk1_design = SurveyDesign(
            weights="weight", replicate_weights=rw_cols,
            replicate_method="JK1",
        )
        results["replicate"] = _fit_2x2(data, "outcome", "post", jk1_design)

    def multi_outcome_loop():
        metrics = ("outcome", "consideration", "purchase_intent")
        results["multi_outcome"] = {
            metric: _fit_2x2(data, metric, "post", analytical_design)
            for metric in metrics
        }

    def pretrends():
        results["pt"] = check_parallel_trends(
            data, outcome="outcome", time="period",
            treatment_group="treat_unit",
            pre_periods=list(range(1, 7)),
        )

    def placebo_refit():
        # Restrict to periods before 7 and pretend treatment began at 4.
        pre_window = data[data["period"] < 7].copy()
        pre_window["placebo_post"] = (pre_window["period"] >= 4).astype(int)
        results["placebo"] = _fit_2x2(
            pre_window, "outcome", "placebo_post", analytical_design,
        )

    def honest_did_grid():
        event_study = MultiPeriodDiD().fit(
            data, outcome="outcome", treatment="treat_unit",
            time="period", unit="unit", reference_period=6,
            survey_design=analytical_design,
        )
        results["event_study"] = event_study
        results["honest"] = {
            m_bar: compute_honest_did(
                event_study, method="relative_magnitude", M=m_bar,
            )
            for m_bar in (0.5, 1.0, 1.5)
        }

    labels = (
        "1_naive_fit_no_survey_design",
        "2_tsl_strata_psu_fpc",
        "3_replicate_weights_jk1",
        "4_multi_outcome_loop_3_metrics",
        "5_check_parallel_trends",
        "6_placebo_refit_pre_period",
        "7_event_study_plus_honest_did",
    )
    steps = (
        naive_fit,
        tsl_fit,
        replicate_fit,
        multi_outcome_loop,
        pretrends,
        placebo_refit,
        honest_did_grid,
    )
    return list(zip(labels, steps))
def build_microdata(n_states, n_years, n_per_cell, n_strata, n_psu, seed=42):
    """Simulate respondent-level survey microdata on a state x year grid.

    Rows are laid out state-major, then year (starting at 2010), with
    ``n_per_cell`` respondents per state-year cell.

    Args:
        n_states: Number of states (ids ``0 .. n_states - 1``).
        n_years: Number of consecutive years starting at 2010.
        n_per_cell: Respondents per state-year cell.
        n_strata: Number of sampling strata.
        n_psu: Total PSU count; each stratum gets ``n_psu // n_strata`` ids.
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        DataFrame with columns ``state``, ``year``, ``strata``, ``psu``,
        ``finalwt``, ``y`` and ``first_treat``.
    """
    gen = np.random.default_rng(seed)
    cell_block = n_years * n_per_cell
    total_rows = n_states * cell_block

    state_ids = np.repeat(np.arange(n_states), cell_block)
    years_one_state = np.repeat(np.arange(2010, 2010 + n_years), n_per_cell)
    year_ids = np.tile(years_one_state, n_states)

    # Draw order below is load-bearing: it fixes the random stream.
    psu_per_stratum = n_psu // n_strata
    stratum_ids = gen.integers(0, n_strata, size=total_rows)
    within_stratum = gen.integers(0, psu_per_stratum, size=total_rows)
    psu_ids = stratum_ids * psu_per_stratum + within_stratum
    weights = gen.lognormal(0, 0.4, size=total_rows) * 50.0

    # One adoption year per state; 0 marks a never-treated state.
    state_cohort = gen.choice(
        [0, 2013, 2014, 2015, 2016, 2017],
        size=n_states,
        p=[0.4, 0.12, 0.12, 0.12, 0.12, 0.12],
    )
    adoption_year = state_cohort[state_ids]
    is_treated = (adoption_year > 0) & (year_ids >= adoption_year)

    base_noise = gen.normal(0, 1, size=total_rows)
    state_scaled_noise = gen.normal(0, 0.2, size=total_rows)
    outcome = (
        base_noise
        + 0.5 * (year_ids - 2010)
        + 3.0 * is_treated.astype(float)
        + state_scaled_noise * state_ids
    )

    columns = {
        "state": state_ids,
        "year": year_ids,
        "strata": stratum_ids,
        "psu": psu_ids,
        "finalwt": weights,
        "y": outcome,
        "first_treat": adoption_year,
    }
    return pd.DataFrame(columns)
def run_scale(scale, config):
    """Run the BRFSS panel scenario at one scale.

    Builds the microdata from ``config``, wires up the phase list, and hands
    both to ``run_scenario`` together with descriptive metadata measured from
    the generated frame.

    Args:
        scale: Scale label; used in the scenario name and the metadata.
        config: Keyword arguments forwarded to ``build_microdata``.
    """
    micro = build_microdata(**config)
    results = {}
    phases = make_phases(micro, results)

    def distinct(column):
        # Cast to a plain int for the metadata payload.
        return int(micro[column].nunique())

    scenario_name = f"brfss_panel_{scale}"
    metadata = {
        "scale": scale,
        "n_microdata_rows": int(len(micro)),
        "n_states": distinct("state"),
        "n_years": distinct("year"),
        "n_strata": distinct("strata"),
        "n_psu": distinct("psu"),
        "n_bootstrap": 199,
    }
    run_scenario(scenario_name, phases, metadata=metadata)
def build_data(n_units, n_periods, cohort_periods, seed=42):
    """Generate the staggered-adoption panel and attach two covariates.

    Args:
        n_units: Number of units passed to ``generate_staggered_data``.
        n_periods: Number of periods passed to ``generate_staggered_data``.
        cohort_periods: Adoption periods passed to ``generate_staggered_data``.
        seed: Seed for the generator; the covariate draws use ``seed + 1``.

    Returns:
        The generated frame with added ``log_pop`` (one draw per unit,
        mapped onto every row of that unit) and ``baseline_spend`` (one
        draw per row) columns.
    """
    generator_args = {
        "n_units": n_units,
        "n_periods": n_periods,
        "cohort_periods": cohort_periods,
        "never_treated_frac": 0.3,
        "treatment_effect": 3.0,
        "dynamic_effects": True,
        "effect_growth": 0.1,
        "seed": seed,
    }
    panel = generate_staggered_data(**generator_args)

    # Separate stream so the covariates do not reuse the generator's seed.
    covariate_rng = np.random.default_rng(seed + 1)

    # Unit-level covariate: drawn once per unit, then broadcast to its rows.
    per_unit_draws = covariate_rng.normal(0, 1, size=panel["unit"].nunique())
    ordered_units = sorted(panel["unit"].unique())
    log_pop_by_unit = pd.Series(per_unit_draws, index=ordered_units)
    panel["log_pop"] = panel["unit"].map(log_pop_by_unit)

    # Row-level covariate: an independent draw for every observation.
    panel["baseline_spend"] = covariate_rng.normal(0, 1, size=len(panel))
    return panel
def run_scale(scale, config):
    """Run the staggered-campaign scenario at one scale.

    Args:
        scale: Scale label; used in the scenario name and the metadata.
        config: Keyword arguments forwarded to ``build_data``; its
            ``n_units``, ``n_periods`` and ``cohort_periods`` entries are
            also recorded in the metadata.
    """
    data = build_data(**config)
    covars = ["log_pop", "baseline_spend"]
    # Shared by every estimator phase so they all fit the same panel spec.
    fit_kwargs = {
        "data": data,
        "outcome": "outcome",
        "unit": "unit",
        "time": "period",
        "first_treat": "first_treat",
    }
    results = {}
    phases = make_phases(data, results, covars, fit_kwargs)

    scenario_name = f"campaign_staggered_{scale}"
    metadata = {
        "scale": scale,
        "n_units": config["n_units"],
        "n_periods": config["n_periods"],
        "n_cohorts": len(config["cohort_periods"]),
        "n_obs": int(len(data)),
        "covariates": covars,
        "n_bootstrap": 999,
        "aggregate": "all",
        "estimation_method": "dr",
    }
    run_scenario(scenario_name, phases, metadata=metadata)
def build_data(seed=42):
    """Generate the dose-response panel and verify it has one treated cohort.

    Args:
        seed: Seed forwarded to ``generate_continuous_did_data``.

    Returns:
        The generated frame (500 units, 6 periods requested).

    Raises:
        AssertionError: If the generated ``first_treat`` column does not
            contain exactly one positive value.
    """
    # cohort_periods=[3] pins the single treated cohort to period 3 to
    # match the documented scenario shape. The generator default would
    # be period 2, which would desync this scenario from the spec in
    # docs/performance-scenarios.md and from the binarized DiD
    # comparison phase in main().
    df = generate_continuous_did_data(
        n_units=500, n_periods=6, cohort_periods=[3], seed=seed,
    )
    positive_first_treat = sorted(
        v for v in df["first_treat"].unique() if v > 0
    )
    # Raised explicitly rather than via the ``assert`` statement: ``assert``
    # is compiled away under ``python -O``, which would silently drop this
    # scenario-shape guard. The exception type and message are unchanged.
    if len(positive_first_treat) != 1:
        raise AssertionError(
            f"dose-response scenario expects exactly one treated cohort; "
            f"got first_treat values {positive_first_treat}"
        )
    return df
Two different + # spellings within one estimator - flagged in performance-plan.md. + r = results["cubic"] + out = {} + for level in ("dose_response", "group_time"): + out[level] = r.to_dataframe(level=level) + results["curves"] = out + + def cdid_event_study(): + cdid = ContinuousDiD( + degree=3, num_knots=1, n_bootstrap=0, seed=123, + ) + results["event_study"] = cdid.fit( + **fit_kwargs, aggregate="eventstudy", + ) + + def binarized_comparison(): + # Derive post from the actual first_treat cohort in the data so + # this phase is aligned with the CDiD fits above. A hardcoded + # period cutoff would silently desync if the DGP cohort moves. + treated_cohort = int( + sorted(v for v in data["first_treat"].unique() if v > 0)[0] + ) + data_bin = data.copy() + data_bin["treated_any"] = (data_bin["dose"] > 0).astype(int) + data_bin["post"] = (data_bin["period"] >= treated_cohort).astype(int) + did = DifferenceInDifferences(robust=True) + results["binarized"] = did.fit( + data_bin, outcome="outcome", treatment="treated_any", time="post", + ) + + def spline_sensitivity_linear(): + cdid = ContinuousDiD( + degree=1, num_knots=0, n_bootstrap=199, seed=123, + ) + results["linear"] = cdid.fit(**fit_kwargs, aggregate="dose") + + def spline_sensitivity_more_knots(): + cdid = ContinuousDiD( + degree=3, num_knots=2, n_bootstrap=199, seed=123, + ) + results["many_knots"] = cdid.fit(**fit_kwargs, aggregate="dose") + + phases = [ + ("1_cdid_cubic_spline_bootstrap199", cdid_cubic_fit), + ("2_extract_dose_response_dataframes", extract_curves), + ("3_cdid_event_study_pretrend", cdid_event_study), + ("4_binarized_did_comparison", binarized_comparison), + ("5_spline_sensitivity_degree1", spline_sensitivity_linear), + ("6_spline_sensitivity_num_knots2", spline_sensitivity_more_knots), + ] + + run_scenario( + "dose_response", + phases, + metadata={ + "n_units": 500, "n_periods": 6, "n_bootstrap": 199, + "spline_configs": ["degree=3,k=1", "degree=1,k=0", "degree=3,k=2"], + }, + ) + + +if 
__name__ == "__main__": + main() diff --git a/benchmarks/speed_review/bench_geo_few_markets.py b/benchmarks/speed_review/bench_geo_few_markets.py new file mode 100644 index 00000000..36575892 --- /dev/null +++ b/benchmarks/speed_review/bench_geo_few_markets.py @@ -0,0 +1,135 @@ +""" +Scenario 4: Geo-experiment with few treated markets (SyntheticDiD). + +Chains: SDiD with jackknife variance (N LOO refits) -> SDiD with bootstrap +variance for SE comparison -> in_time_placebo -> get_loo_effects_df -> +sensitivity_to_zeta_omega -> weight-concentration diagnostic. + +Three scales: + - small (80 units, 5 treated): Tutorial 18 DMA panel + - medium (200 units, 15 treated): zip-cluster or large geo-experiment + - large (500 units, 30 treated): zip-level or multi-market at scale + (Python backend skipped at this scale; + Python FW solver scales poorly) + +The python backend is skipped at "large" because the pure-numpy Frank-Wolfe +solver plus jackknife (500 LOO refits x ~0.5s each) would take at least four +minutes per run without providing additional signal; the medium scale already +establishes the Python-vs-Rust gap. 
+""" + +import os + +from diff_diff import SyntheticDiD +from diff_diff.prep import generate_factor_data + +from bench_shared import run_scenario + + +SCALES = { + "small": {"n_units": 80, "n_pre": 6, "n_post": 6, "n_treated": 5}, + "medium": {"n_units": 200, "n_pre": 6, "n_post": 6, "n_treated": 15}, + "large": {"n_units": 500, "n_pre": 6, "n_post": 6, "n_treated": 30}, +} +SKIP_PYTHON_AT = {"large"} + + +def build_data(n_units, n_pre, n_post, n_treated, seed=42): + return generate_factor_data( + n_units=n_units, n_pre=n_pre, n_post=n_post, n_treated=n_treated, + n_factors=2, treatment_effect=2.0, + factor_strength=1.0, treated_loading_shift=0.5, + seed=seed, + ) + + +def make_phases(data, post_periods, results): + def sdid_jackknife(): + sdid = SyntheticDiD(variance_method="jackknife", seed=123) + results["jk"] = sdid.fit( + data, outcome="outcome", unit="unit", time="period", + treatment="treat", post_periods=post_periods, + ) + + def sdid_bootstrap(): + sdid = SyntheticDiD( + variance_method="bootstrap", n_bootstrap=200, seed=123, + ) + results["bs"] = sdid.fit( + data, outcome="outcome", unit="unit", time="period", + treatment="treat", post_periods=post_periods, + ) + + def in_time_placebo(): + fn = getattr(results["jk"], "in_time_placebo", None) + if fn is None: + raise RuntimeError("in_time_placebo not available on results") + results["in_time"] = fn() + + def loo_effects_df(): + fn = getattr(results["jk"], "get_loo_effects_df", None) + if fn is None: + raise RuntimeError("get_loo_effects_df not available") + results["loo"] = fn() + + def sensitivity_zeta_omega(): + fn = getattr(results["jk"], "sensitivity_to_zeta_omega", None) + if fn is None: + raise RuntimeError("sensitivity_to_zeta_omega not available") + results["zeta"] = fn() + + def weight_concentration(): + fn = getattr(results["jk"], "get_weight_concentration", None) + if fn is None: + raise RuntimeError("get_weight_concentration not available") + results["wc"] = fn() + + return [ + 
("1_sdid_jackknife_variance", sdid_jackknife), + ("2_sdid_bootstrap_variance_200", sdid_bootstrap), + ("3_in_time_placebo", in_time_placebo), + ("4_get_loo_effects_df", loo_effects_df), + ("5_sensitivity_to_zeta_omega", sensitivity_zeta_omega), + ("6_weight_concentration", weight_concentration), + ] + + +def run_scale(scale, config): + backend_env = os.environ.get("DIFF_DIFF_BACKEND", "auto").lower() + if scale in SKIP_PYTHON_AT and backend_env == "python": + print(f" [skip] geo_few_markets/{scale} backend=python " + f"(Python FW solver scales poorly)") + return + + data = build_data(**config) + post_periods = sorted( + data.loc[(data["treat"] == 1) & (data["treated"] == 1), + "period"].unique().tolist(), + ) + results = {} + phases = make_phases(data, post_periods, results) + + run_scenario( + f"geo_few_markets_{scale}", + phases, + metadata={ + "scale": scale, + "n_units": config["n_units"], + "n_pre": config["n_pre"], + "n_post": config["n_post"], + "n_treated": config["n_treated"], + "n_factors": 2, + }, + ) + + +def main(): + for scale, config in SCALES.items(): + print(f"\n{'='*60}\n geo_few_markets / scale={scale} " + f"(n_units={config['n_units']}, " + f"n_treated={config['n_treated']})\n{'='*60}") + run_scale(scale, config) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/speed_review/bench_reversible_dcdh.py b/benchmarks/speed_review/bench_reversible_dcdh.py new file mode 100644 index 00000000..5788b02c --- /dev/null +++ b/benchmarks/speed_review/bench_reversible_dcdh.py @@ -0,0 +1,115 @@ +""" +Scenario 5: Reversible treatment with dCDH, L_max multi-horizon, survey TSL. + +Chains: dCDH fit with L_max=3 (multi-horizon DID_l + dynamic placebos + +sup-t bands + TWFE diagnostic + survey TSL) -> inspect placebo -> +compute_honest_did on placebo event study -> heterogeneity refit. + +Data shape: 120 groups x 10 periods, single-switch reversible pattern, +survey-weighted with 8 strata and 24 PSUs. 
+""" + +import numpy as np +import pandas as pd + +from diff_diff import ( + ChaisemartinDHaultfoeuille, + SurveyDesign, + compute_honest_did, +) +from diff_diff.prep import generate_reversible_did_data + +from bench_shared import run_scenario + + +def attach_survey_columns(df, seed=42, n_strata=8, psu_per_stratum=3): + rng = np.random.default_rng(seed) + groups = sorted(df["group"].unique()) + n_groups = len(groups) + stratum_map = {g: i % n_strata for i, g in enumerate(groups)} + psu_map = { + g: stratum_map[g] * psu_per_stratum + (i // n_strata) % psu_per_stratum + for i, g in enumerate(groups) + } + weight_map = { + g: float(rng.lognormal(0, 0.3)) for g in groups + } + df = df.copy() + df["stratum"] = df["group"].map(stratum_map) + df["psu"] = df["group"].map(psu_map) + df["pw"] = df["group"].map(weight_map) + return df + + +def main(): + raw = generate_reversible_did_data( + n_groups=120, n_periods=10, pattern="single_switch", + initial_treat_frac=0.3, p_switch=0.15, + treatment_effect=2.0, heterogeneous_effects=True, + seed=42, + ) + data = attach_survey_columns(raw) + + results = {} + fit_kwargs = dict( + data=data, outcome="outcome", group="group", time="period", + treatment="treatment", + ) + sd = SurveyDesign( + weights="pw", strata="stratum", psu="psu", + ) + + def dcdh_fit_lmax3(): + est = ChaisemartinDHaultfoeuille(seed=123) + results["dcdh"] = est.fit( + **fit_kwargs, L_max=3, survey_design=sd, + ) + + def inspect_placebo(): + r = results["dcdh"] + results["placebo_summary"] = { + "placebo_effect": getattr(r, "placebo_effect", None), + "overall_att": getattr(r, "overall_att", None), + "joiners_att": getattr(r, "joiners_att", None), + "leavers_att": getattr(r, "leavers_att", None), + } + + def honest_placebo(): + out = {} + for M in (0.5, 1.0, 1.5): + out[M] = compute_honest_did( + results["dcdh"], method="relative_magnitude", M=M, + ) + results["honest"] = out + + def heterogeneity_refit(): + # Use the same SurveyDesign as the main fit; the scenario 
framing + # is the survey-TSL workflow, and the TSL-sharing optimization + # conclusion in performance-plan.md depends on both fits running + # under the same survey design. + est = ChaisemartinDHaultfoeuille(seed=123) + results["het"] = est.fit( + **fit_kwargs, L_max=3, survey_design=sd, heterogeneity="group", + ) + + phases = [ + ("1_dcdh_fit_Lmax3_survey_TSL", dcdh_fit_lmax3), + ("2_inspect_placebo_and_summary", inspect_placebo), + ("3_honest_did_on_placebo", honest_placebo), + ("4_heterogeneity_refit", heterogeneity_refit), + ] + + run_scenario( + "reversible_dcdh", + phases, + metadata={ + "n_groups": 120, "n_periods": 10, + "pattern": "single_switch", "L_max": 3, + "n_strata": int(data["stratum"].nunique()), + "n_psu": int(data["psu"].nunique()), + }, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/speed_review/bench_shared.py b/benchmarks/speed_review/bench_shared.py new file mode 100644 index 00000000..4f50fdc2 --- /dev/null +++ b/benchmarks/speed_review/bench_shared.py @@ -0,0 +1,258 @@ +""" +Shared harness for the practitioner-workflow performance scenarios. + +Each ``bench_*.py`` script imports ``run_scenario`` and hands it a +list of phases (label, callable). The harness times each phase, wraps the +full chain in a pyinstrument profile, and writes: + +- ``benchmarks/speed_review/baselines/{scenario}_{backend}.json`` - per-phase wall-clock +- ``benchmarks/speed_review/baselines/profiles/{scenario}_{backend}.html`` - flame profile + +If any phase raises, the exception is caught and recorded as +``{"ok": false}`` in the per-phase JSON, AND the process exits 1 after +artifacts are written so that ``run_all.py`` and CI can detect the failure. + +Backend is auto-detected via ``diff_diff._backend.HAS_RUST_BACKEND`` and the +``DIFF_DIFF_BACKEND`` env var. Run each script twice - once with +``DIFF_DIFF_BACKEND=python`` and once with ``DIFF_DIFF_BACKEND=rust`` - to +populate both files. 
+ +See ``docs/performance-scenarios.md`` for scenario definitions and +``docs/performance-plan.md`` for the per-scenario findings and action +recommendations derived from these results. +""" + +import atexit +import json +import os +import sys +import threading +import time +import warnings +from pathlib import Path + +import numpy as np + +try: + from pyinstrument import Profiler + HAS_PYINSTRUMENT = True +except ImportError: + HAS_PYINSTRUMENT = False + Profiler = None # type: ignore[assignment,misc] + +try: + import psutil + HAS_PSUTIL = True +except ImportError: + HAS_PSUTIL = False + psutil = None # type: ignore[assignment] + + +class _RSSSampler: + """Background thread that samples process RSS every ~10ms. + + Gives per-scenario peak memory without depending on + `resource.getrusage(RUSAGE_SELF).ru_maxrss` (which is monotonic across + the whole process and so would leak scale-1 peaks into scale-2 reports + in multi-scale scripts). If psutil is missing, the sampler reports + peak=0 and the caller falls back to not recording memory. 
+ """ + + def __init__(self, interval_s=0.01): + self.interval = interval_s + self.peak_bytes = 0 + self.start_bytes = 0 + self._stop = threading.Event() + self._thread = None + self._proc = psutil.Process() if HAS_PSUTIL else None + + def start(self): + if self._proc is None: + return + self.start_bytes = self._proc.memory_info().rss + self.peak_bytes = self.start_bytes + self._stop.clear() + + def sample(): + while not self._stop.is_set(): + try: + rss = self._proc.memory_info().rss + if rss > self.peak_bytes: + self.peak_bytes = rss + except Exception: + pass + self._stop.wait(self.interval) + + self._thread = threading.Thread(target=sample, daemon=True) + self._thread.start() + + def stop(self): + if self._proc is None: + return + self._stop.set() + if self._thread: + self._thread.join(timeout=0.2) + + @property + def peak_mb(self): + return self.peak_bytes / (1024 * 1024) + + @property + def start_mb(self): + return self.start_bytes / (1024 * 1024) + +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + +from diff_diff._backend import HAS_RUST_BACKEND + +RESULTS_DIR = Path(__file__).resolve().parent / "baselines" +PROFILE_DIR = RESULTS_DIR / "profiles" + +# Module-level failure flag. Set True whenever run_scenario sees any phase +# record ok=False. atexit handler below translates this into a nonzero +# process exit code so run_all.py and CI can detect partial-failure runs. +# Multi-scale scripts still complete all scales before the process exits. 
+_any_phase_failed = False + + +def _exit_with_failure_status(): + if _any_phase_failed: + print( + "\n [bench_shared] at least one phase failed; " + "exiting nonzero", file=sys.stderr, + ) + os._exit(1) + + +atexit.register(_exit_with_failure_status) + + +def _backend_label(): + """Return 'rust' or 'python' for file naming.""" + env = os.environ.get("DIFF_DIFF_BACKEND", "auto").lower() + if env == "python": + return "python" + if env == "rust": + return "rust" + return "rust" if HAS_RUST_BACKEND else "python" + + +def run_scenario(scenario_name, phases, metadata=None): + """Time a list of phases and write JSON + pyinstrument profile. + + Parameters + ---------- + scenario_name : str + Filename stem, e.g. ``"campaign_staggered"``. Output files use + ``{scenario_name}_{backend}.(json|html)``. + phases : list of (label, callable) tuples + Each callable takes no arguments; any return value is ignored. + There is no shared ``context`` dict: phases exchange state only + through what each callable captures from its enclosing scope + (e.g. a ``results`` dict owned by the calling script). + metadata : dict, optional + Extra fields folded into the JSON under ``metadata`` (data shape, + params, etc.). Pure data, no callables. 
+ """ + backend = _backend_label() + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + PROFILE_DIR.mkdir(parents=True, exist_ok=True) + + warnings.filterwarnings( + "ignore", message=".*invalid value encountered in matmul.*", + category=RuntimeWarning, + ) + + profile = None + if HAS_PYINSTRUMENT: + profile = Profiler(async_mode="disabled") + profile.start() + + sampler = _RSSSampler() + sampler.start() + + phase_times = {} + total_start = time.perf_counter() + try: + for label, fn in phases: + t0 = time.perf_counter() + try: + fn() + phase_times[label] = { + "seconds": time.perf_counter() - t0, + "ok": True, + "error": None, + } + except Exception as e: + phase_times[label] = { + "seconds": time.perf_counter() - t0, + "ok": False, + "error": f"{type(e).__name__}: {e}", + } + print(f" [{label}] FAILED: {type(e).__name__}: {e}") + finally: + total_elapsed = time.perf_counter() - total_start + sampler.stop() + if profile is not None: + profile.stop() + html_path = PROFILE_DIR / f"{scenario_name}_{backend}.html" + with open(html_path, "w") as f: + f.write(profile.output_html()) + repo_root = Path(__file__).resolve().parents[2] + print(f" profile -> {html_path.relative_to(repo_root)}") + + memory = { + "available": HAS_PSUTIL, + "start_mb": round(sampler.start_mb, 2) if HAS_PSUTIL else None, + "peak_mb": round(sampler.peak_mb, 2) if HAS_PSUTIL else None, + "growth_mb": ( + round(sampler.peak_mb - sampler.start_mb, 2) + if HAS_PSUTIL else None + ), + "sampler_interval_s": sampler.interval, + } + + record = { + "scenario": scenario_name, + "backend": backend, + "has_rust_backend": HAS_RUST_BACKEND, + "total_seconds": total_elapsed, + "memory": memory, + "phases": phase_times, + "metadata": metadata or {}, + "diff_diff_version": _get_version(), + "numpy_version": np.__version__, + } + + json_path = RESULTS_DIR / f"{scenario_name}_{backend}.json" + with open(json_path, "w") as f: + json.dump(record, f, indent=2, default=str) + + mem_str = ( + f" 
peak_rss={memory['peak_mb']:.0f}MB " + f"(+{memory['growth_mb']:.0f}MB during run)" + if HAS_PSUTIL else " [no psutil; skipping memory]" + ) + print( + f"\n [{scenario_name}] backend={backend} " + f"total={total_elapsed:.2f}s{mem_str}" + ) + for label, info in phase_times.items(): + status = "OK " if info["ok"] else "ERR" + print(f" {status} {label:<40} {info['seconds']:>8.3f}s") + repo_root = Path(__file__).resolve().parents[2] + print(f" json -> {json_path.relative_to(repo_root)}") + + if any(not info["ok"] for info in phase_times.values()): + global _any_phase_failed + _any_phase_failed = True + + return record + + +def _get_version(): + try: + import diff_diff + return diff_diff.__version__ + except Exception: + return "unknown" diff --git a/benchmarks/speed_review/gen_findings_tables.py b/benchmarks/speed_review/gen_findings_tables.py new file mode 100644 index 00000000..e1258094 --- /dev/null +++ b/benchmarks/speed_review/gen_findings_tables.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +""" +Regenerate the numerical tables in ``docs/performance-plan.md`` from the +committed JSON baselines under ``benchmarks/speed_review/baselines/``. + +Each auto-generated table is bounded by a pair of HTML-comment markers in +the target markdown file: + + + ... (rendered table body lives here; overwritten on every run) ... + + +Run this after any benchmark rerun; the doc tables then re-derive exactly +from the JSON baselines, removing the possibility of hand-edit drift. + +Tables owned by this generator: + - scale_sweep_totals end-to-end wall-clock per scenario + scale + - memory_by_scenario peak RSS + growth per scenario + scale + - top_phases_by_scenario largest-scale phase-level timing ranking + +Narrative prose in the doc is hand-written and not touched. If numerical +claims in narrative drift from the regenerated tables, the reviewer must +update the narrative manually - by design, to force a human read of the +findings whenever numbers shift meaningfully. 
+""" + +import json +import re +from pathlib import Path +from textwrap import dedent + +HERE = Path(__file__).resolve().parent +BASELINES = HERE / "baselines" +PLAN_MD = HERE.parent.parent / "docs" / "performance-plan.md" + +SCALE_ORDER = ("small", "medium", "large") +MULTI_SCALE = ( + "campaign_staggered", + "brand_awareness_survey", + "brfss_panel", + "geo_few_markets", +) +SINGLE_SCALE = ("reversible_dcdh", "dose_response") + +SCENARIO_DISPLAY = { + "campaign_staggered": "1. Staggered campaign", + "brand_awareness_survey": "2. Brand awareness survey", + "brfss_panel": "3. BRFSS microdata -> CS panel", + "geo_few_markets": "4. SDiD few markets", + "reversible_dcdh": "5. Reversible dCDH", + "dose_response": "6. Pricing dose-response", +} + + +def load(scenario, scale, backend): + if scale is None: + path = BASELINES / f"{scenario}_{backend}.json" + else: + path = BASELINES / f"{scenario}_{scale}_{backend}.json" + if not path.exists(): + return None + return json.loads(path.read_text()) + + +def fmt_secs(x): + return f"{x:.2f}" if x is not None else "skip" + + +def fmt_mb(x): + return f"{x:.0f}" if x is not None else "skip" + + +def render_scale_sweep_totals(): + rows = [ + "| Scenario | Scale | Python (s) | Rust (s) | Py/Rust |", + "|---|---|---:|---:|---:|", + ] + for scen in MULTI_SCALE: + display = SCENARIO_DISPLAY[scen] + first = True + for scale in SCALE_ORDER: + py = load(scen, scale, "python") + rs = load(scen, scale, "rust") + py_t = py["total_seconds"] if py else None + rs_t = rs["total_seconds"] if rs else None + ratio = ( + f"{py_t/rs_t:.1f}x" + if (py_t is not None and rs_t is not None and rs_t > 0) + else "-" + ) + name_col = display if first else "" + first = False + rows.append( + f"| {name_col} | {scale} | " + f"{fmt_secs(py_t)} | {fmt_secs(rs_t)} | {ratio} |" + ) + for scen in SINGLE_SCALE: + display = SCENARIO_DISPLAY[scen] + py = load(scen, None, "python") + rs = load(scen, None, "rust") + py_t = py["total_seconds"] if py else None + rs_t = 
rs["total_seconds"] if rs else None + ratio = ( + f"{py_t/rs_t:.1f}x" + if (py_t is not None and rs_t is not None and rs_t > 0) + else "-" + ) + rows.append( + f"| {display} | single | " + f"{fmt_secs(py_t)} | {fmt_secs(rs_t)} | {ratio} |" + ) + return "\n".join(rows) + + +def render_memory_by_scenario(): + rows = [ + "| Scenario | Scale | Py peak RSS (MB) | Py growth (MB) | " + "Rust peak RSS (MB) | Rust growth (MB) |", + "|---|---|---:|---:|---:|---:|", + ] + for scen in MULTI_SCALE: + display = SCENARIO_DISPLAY[scen] + first = True + for scale in SCALE_ORDER: + py = load(scen, scale, "python") + rs = load(scen, scale, "rust") + py_peak = py["memory"]["peak_mb"] if py else None + py_growth = py["memory"]["growth_mb"] if py else None + rs_peak = rs["memory"]["peak_mb"] if rs else None + rs_growth = rs["memory"]["growth_mb"] if rs else None + name_col = display if first else "" + first = False + rows.append( + f"| {name_col} | {scale} | " + f"{fmt_mb(py_peak)} | {fmt_mb(py_growth)} | " + f"{fmt_mb(rs_peak)} | {fmt_mb(rs_growth)} |" + ) + for scen in SINGLE_SCALE: + display = SCENARIO_DISPLAY[scen] + py = load(scen, None, "python") + rs = load(scen, None, "rust") + py_peak = py["memory"]["peak_mb"] if py else None + py_growth = py["memory"]["growth_mb"] if py else None + rs_peak = rs["memory"]["peak_mb"] if rs else None + rs_growth = rs["memory"]["growth_mb"] if rs else None + rows.append( + f"| {display} | single | " + f"{fmt_mb(py_peak)} | {fmt_mb(py_growth)} | " + f"{fmt_mb(rs_peak)} | {fmt_mb(rs_growth)} |" + ) + return "\n".join(rows) + + +def render_top_phases_by_scenario(): + """Top-3 phases at the largest-available scale per (scenario, backend). + + If a scenario/backend skips `large` (e.g., geo_few_markets Python), + this falls back to the largest measured scale for that backend so + the table still reports the Python-vs-Rust comparison rather than + dropping the row entirely. 
+ """ + rows = [ + "| Scenario | Scale | Backend | Top phase (%) " + "| 2nd phase (%) | 3rd phase (%) |", + "|---|---|---|---|---|---|", + ] + + def phase_rank(record, n=3): + if record is None: + return [] + total = record["total_seconds"] + phases = sorted( + record["phases"].items(), + key=lambda kv: -kv[1]["seconds"], + ) + out = [] + for label, info in phases[:n]: + pct = 100 * info["seconds"] / total if total > 0 else 0 + out.append(f"`{label}` ({pct:.0f}%)") + while len(out) < n: + out.append("-") + return out + + def largest_available(scen, backend): + """Return (scale, record) for the largest scale this backend has.""" + for scale in reversed(SCALE_ORDER): + rec = load(scen, scale, backend) + if rec is not None: + return scale, rec + return None, None + + for scen in MULTI_SCALE: + display = SCENARIO_DISPLAY[scen] + for backend in ("python", "rust"): + scale, rec = largest_available(scen, backend) + top = phase_rank(rec) + if not top: + continue + rows.append( + f"| {display} | {scale} | {backend} | " + f"{top[0]} | {top[1]} | {top[2]} |" + ) + for scen in SINGLE_SCALE: + display = SCENARIO_DISPLAY[scen] + for backend in ("python", "rust"): + rec = load(scen, None, backend) + top = phase_rank(rec) + if not top: + continue + rows.append( + f"| {display} | single | {backend} | " + f"{top[0]} | {top[1]} | {top[2]} |" + ) + return "\n".join(rows) + + +TABLES = { + "scale_sweep_totals": render_scale_sweep_totals, + "memory_by_scenario": render_memory_by_scenario, + "top_phases_by_scenario": render_top_phases_by_scenario, +} + + +def update_markdown(path): + text = path.read_text() + for table_id, renderer in TABLES.items(): + body = renderer() + pattern = re.compile( + rf"()" + rf".*?" + rf"()", + re.DOTALL, + ) + replacement = f"\\g<1>\n{body}\n\\g<2>" + new_text, n = pattern.subn(replacement, text) + if n == 0: + raise RuntimeError( + f"No marker pair found for table '{table_id}' in {path}." + f" Add ..." + f" to the document first." 
+ ) + if n > 1: + raise RuntimeError( + f"Multiple marker pairs for '{table_id}' in {path}." + ) + text = new_text + path.write_text(text) + + +def main(): + update_markdown(PLAN_MD) + print(f"regenerated tables in {PLAN_MD.relative_to(PLAN_MD.parents[2])}") + for k in TABLES: + print(f" - {k}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/speed_review/mem_profile_brfss.py b/benchmarks/speed_review/mem_profile_brfss.py new file mode 100644 index 00000000..25495dce --- /dev/null +++ b/benchmarks/speed_review/mem_profile_brfss.py @@ -0,0 +1,143 @@ +""" +Per-function allocation attribution for the BRFSS-1M scenario. + +Runs the large-scale BRFSS `aggregate_survey` path under `tracemalloc` and +writes top-N allocation sites to +``benchmarks/speed_review/baselines/mem_profile_brfss_large_{backend}.txt``. + +Standalone because tracemalloc has 2-5x overhead; running it inside the +main timing harness would contaminate the wall-clock baselines. Companion +to the psutil-sampled peak RSS captured in the main JSON +baselines — this script tells us WHERE the memory went, those tell us +HOW MUCH. 
+""" + +import argparse +import tracemalloc +from pathlib import Path + +import numpy as np +import pandas as pd + +from diff_diff import SurveyDesign, aggregate_survey +from diff_diff._backend import HAS_RUST_BACKEND + + +BASELINES = Path(__file__).resolve().parent / "baselines" + + +def build_microdata(n_states=50, n_years=10, n_per_cell=2000, + n_strata=20, n_psu=1000, seed=42): + rng = np.random.default_rng(seed) + n_rows = n_states * n_years * n_per_cell + state = np.repeat(np.arange(n_states), n_years * n_per_cell) + year = np.tile( + np.repeat(np.arange(2010, 2010 + n_years), n_per_cell), + n_states, + ) + stratum = rng.integers(0, n_strata, size=n_rows) + psu = stratum * (n_psu // n_strata) + rng.integers( + 0, n_psu // n_strata, size=n_rows, + ) + weight = rng.lognormal(0, 0.4, size=n_rows) * 50.0 + y = ( + rng.normal(0, 1, size=n_rows) + + 0.5 * (year - 2010) + + rng.normal(0, 0.2, size=n_rows) * state + ) + return pd.DataFrame({ + "state": state, "year": year, + "strata": stratum, "psu": psu, "finalwt": weight, + "y": y, + }) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--top", type=int, default=15, + help="Show top N allocation sites") + args = parser.parse_args() + + BASELINES.mkdir(parents=True, exist_ok=True) + backend = "rust" if HAS_RUST_BACKEND else "python" + out_path = BASELINES / f"mem_profile_brfss_large_{backend}.txt" + + print("Building 1M-row BRFSS microdata...") + micro = build_microdata() + print(f" shape: {micro.shape}, mem: " + f"{micro.memory_usage(deep=True).sum()/1024/1024:.1f} MB") + + sd = SurveyDesign( + weights="finalwt", strata="strata", psu="psu", + ) + + print("Starting tracemalloc...") + tracemalloc.start(25) + snap_before = tracemalloc.take_snapshot() + + print("Running aggregate_survey...") + panel, stage2 = aggregate_survey( + micro, by=["state", "year"], outcomes="y", + survey_design=sd, + ) + + snap_after = tracemalloc.take_snapshot() + stats = snap_after.compare_to(snap_before, "lineno") 
+ current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + total_net_diff = sum(s.size_diff for s in stats) / (1024 * 1024) + top_site_diff = ( + stats[0].size_diff / (1024 * 1024) if stats else 0.0 + ) + + lines = [ + f"# BRFSS-1M aggregate_survey allocation attribution", + f"# backend: {backend}", + f"# input microdata rows: {len(micro):,}", + f"# input microdata memory: " + f"{micro.memory_usage(deep=True).sum()/1024/1024:.1f} MB", + f"# output panel cells: {len(panel)}", + f"", + f"# tracemalloc totals during aggregate_survey", + f"# total net size diff across all sites: {total_net_diff:.1f} MB", + f"# top single-site size diff: {top_site_diff:.2f} MB", + f"# python peak traced: {peak/1024/1024:.1f} MB", + f"# python current retained: {current/1024/1024:.1f} MB", + f"", + f"# top {args.top} allocation sites by size delta", + f"{'#':<4} {'size diff (MB)':>16} {'count diff':>12} location", + f"{'-'*80}", + ] + # Scrub workstation-specific absolute paths before committing output + # (keeps the file reproducible and avoids leaking $HOME / system paths). 
+ import site, sys as _sys + home = str(Path.home()) + sys_paths = sorted( + {p for p in (site.getsitepackages() + [site.getusersitepackages()]) + if p} | {_sys.prefix, _sys.base_prefix}, + key=len, reverse=True, + ) + repo_root = str(Path(__file__).resolve().parents[2]) + + def _scrub(s): + s = s.replace(repo_root, "") + for sp in sys_paths: + s = s.replace(sp, "") + s = s.replace(home, "$HOME") + return s + + for i, s in enumerate(stats[:args.top], 1): + loc = _scrub(str(s.traceback).split("\n")[0]) + lines.append( + f"{i:<4} {s.size_diff/1024/1024:>16.2f} {s.count_diff:>12d} {loc}" + ) + + text = "\n".join(lines) + "\n" + out_path.write_text(text) + print("\n" + text) + print(f"wrote {out_path}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/speed_review/run_all.py b/benchmarks/speed_review/run_all.py new file mode 100644 index 00000000..12b609b5 --- /dev/null +++ b/benchmarks/speed_review/run_all.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Run every practitioner-workflow scenario under both backends. + +Writes per-scenario JSON + pyinstrument HTML under +``benchmarks/speed_review/baselines/`` (and ``.../baselines/profiles/``). +See ``docs/performance-scenarios.md`` for scenario definitions and +``docs/performance-plan.md`` for the derived findings. + +Exit status is nonzero if any scenario subprocess exits nonzero. Scenario +scripts themselves exit 1 on any phase failure (see ``bench_shared.py``), +so this orchestrator reliably surfaces failures. 
+ +Usage: + + python benchmarks/speed_review/run_all.py + python benchmarks/speed_review/run_all.py --backend python + python benchmarks/speed_review/run_all.py --backend rust + python benchmarks/speed_review/run_all.py --scenarios campaign_staggered +""" + +import argparse +import os +import subprocess +import sys +from pathlib import Path + +HERE = Path(__file__).resolve().parent +SCRIPTS = { + "campaign_staggered": "bench_campaign_staggered.py", + "brand_awareness_survey": "bench_brand_awareness_survey.py", + "brfss_panel": "bench_brfss_panel.py", + "geo_few_markets": "bench_geo_few_markets.py", + "reversible_dcdh": "bench_reversible_dcdh.py", + "dose_response": "bench_dose_response.py", +} + + +def run(scenario, backend): + script = HERE / SCRIPTS[scenario] + env = os.environ.copy() + env["DIFF_DIFF_BACKEND"] = backend + print(f"\n===== {scenario} backend={backend} =====") + result = subprocess.run( + [sys.executable, str(script)], env=env, + ) + return result.returncode == 0 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--backend", choices=["python", "rust", "both"], default="both", + ) + parser.add_argument( + "--scenarios", nargs="+", choices=list(SCRIPTS), + default=list(SCRIPTS), + ) + args = parser.parse_args() + + if args.backend == "both": + backends = ["python", "rust"] + else: + backends = [args.backend] + + failures = [] + for backend in backends: + for scenario in args.scenarios: + if not run(scenario, backend): + failures.append((scenario, backend)) + + print("\n\n===== SUMMARY =====") + if failures: + print(f"{len(failures)} scenario/backend combos failed:") + for s, b in failures: + print(f" - {s} ({b})") + sys.exit(1) + else: + print("All scenarios passed.") + + +if __name__ == "__main__": + main() diff --git a/docs/performance-plan.md b/docs/performance-plan.md index bac34b63..58f0f017 100644 --- a/docs/performance-plan.md +++ b/docs/performance-plan.md @@ -4,6 +4,299 @@ This document outlines the strategy for 
improving diff-diff's performance on lar --- +## Practitioner Workflow Baseline (v3.1.3, April 2026) + +Earlier sections of this document (v1.4.0, v2.0.3) measured isolated `fit()` +calls on synthetic panels for R-parity. This section measures **end-to-end +practitioner chains** - Bacon decomposition, fit, event-study pre-trend +inspection, HonestDiD sensitivity grids, cross-estimator robustness refits, +and reporting - at data shapes anchored to applied-econ papers and industry +writeups. The six scenarios are defined in +[`docs/performance-scenarios.md`](performance-scenarios.md); scripts live in +`benchmarks/speed_review/bench_*.py`; raw results in +`benchmarks/speed_review/baselines/*.json` and flame profiles in +`benchmarks/speed_review/baselines/profiles/`. + +Environment: macOS darwin 25.3 on Apple Silicon M4, Python 3.9, +numpy 2.x, diff_diff 3.1.3. Each multi-scale scenario runs at three data +scales under both `DIFF_DIFF_BACKEND=python` and `DIFF_DIFF_BACKEND=rust`, +with one intentional exception: the SDiD few-markets scenario at its +`large` scale runs Rust only, because the pure-numpy jackknife at n=500 +would exceed four minutes per run without changing the already-clear +Python-vs-Rust conclusion established at `small` and `medium`. The +numerical tables below are auto-generated from the committed JSON +baselines by `benchmarks/speed_review/gen_findings_tables.py`; narrative +prose is hand-written and must be re-read when numbers shift. + +### Scale sweep - end-to-end wall-clock + +Four of the six scenarios run at three scales (small / medium / large). The +small scale matches tutorial data shapes; medium reflects typical +practitioner workloads; large stretches toward the upper end of what an +analyst might bring (1M-row BRFSS microdata, 1,500-unit county-level +staggered panel, 1,000-unit multi-region brand survey, 500-unit zip-level +geo-experiment). Dose-response and reversible-dCDH run at a single mid-range +scale. 
Data-shape details are in `docs/performance-scenarios.md`. + + +| Scenario | Scale | Python (s) | Rust (s) | Py/Rust | +|---|---|---:|---:|---:| +| 1. Staggered campaign | small | 0.51 | 0.50 | 1.0x | +| | medium | 0.75 | 0.76 | 1.0x | +| | large | 1.33 | 1.38 | 1.0x | +| 2. Brand awareness survey | small | 0.19 | 0.20 | 1.0x | +| | medium | 0.56 | 0.55 | 1.0x | +| | large | 1.09 | 1.00 | 1.1x | +| 3. BRFSS microdata -> CS panel | small | 1.61 | 1.66 | 1.0x | +| | medium | 6.10 | 6.23 | 1.0x | +| | large | 24.41 | 24.94 | 1.0x | +| 4. SDiD few markets | small | 3.70 | 0.04 | 89.5x | +| | medium | 3.99 | 0.12 | 33.6x | +| | large | skip | 0.26 | - | +| 5. Reversible dCDH | single | 0.72 | 0.75 | 1.0x | +| 6. Pricing dose-response | single | 0.59 | 0.60 | 1.0x | + + +### Scaling findings + +**Three findings are load-bearing for the optimization priority list:** + +1. **BRFSS `aggregate_survey` is the dominant practitioner pain point at + realistic pooled-multi-year scale.** Scales near-linearly with microdata + row count. At 1M rows (roughly what a 10-year pooled BRFSS analysis + looks like) the full chain takes ~24 seconds and essentially all of it + is inside `_compute_stratified_psu_meat`. Rust does not touch it + (`aggregate_survey` is entirely Python). +2. **Staggered CS chain stays cheap across scales.** A 10x unit increase + (150 -> 1,500) is a small-single-digit multiplier on total time. + ImputationDiD and SunAbraham together consistently account for + ~70-80% of the chain; either can be the single top phase at a given + (scale, backend) cell, which is a per-cell ranking detail not a + stable pattern to optimize against. +3. **SDiD Rust gap is stable across scales, not emergent.** Python SDiD + has a fixed per-jackknife-refit overhead that dominates even at small + n. Rust stays sub-second through 500 units. + +**Two findings hold across scales:** + +4. 
Brand-awareness survey total scales roughly linearly in n_units, but + the JK1 replicate path inside it scales closer to + n_units x n_replicates - faster growth than the chain total, so it + increasingly dominates at large n. +5. Rust backend gives large uplift only for SDiD (order-of-magnitude + and up). Elsewhere the gap is modest across all measured (scenario, + scale) cells - see the scale-sweep table for exact ratios. The + primary bottlenecks live in Python code the Rust backend does not + touch (`aggregate_survey`, JK1 replicate fit), and paths that Rust + does touch (CS bootstrap, ImputationDiD, Survey TSL) are already + well-vectorized in Python. + +### Top phases by scenario at largest measured scale + + +| Scenario | Scale | Backend | Top phase (%) | 2nd phase (%) | 3rd phase (%) | +|---|---|---|---|---|---| +| 1. Staggered campaign | large | python | `6_imputation_did_robustness` (49%) | `5_sun_abraham_robustness` (28%) | `2_cs_fit_with_covariates_bootstrap999` (13%) | +| 1. Staggered campaign | large | rust | `6_imputation_did_robustness` (40%) | `5_sun_abraham_robustness` (37%) | `2_cs_fit_with_covariates_bootstrap999` (13%) | +| 2. Brand awareness survey | large | python | `3_replicate_weights_jk1` (57%) | `4_multi_outcome_loop_3_metrics` (22%) | `7_event_study_plus_honest_did` (14%) | +| 2. Brand awareness survey | large | rust | `3_replicate_weights_jk1` (54%) | `4_multi_outcome_loop_3_metrics` (22%) | `7_event_study_plus_honest_did` (14%) | +| 3. BRFSS microdata -> CS panel | large | python | `1_aggregate_survey_microdata_to_panel` (100%) | `5_sun_abraham_robustness` (0%) | `2_cs_fit_with_stage2_survey_design` (0%) | +| 3. BRFSS microdata -> CS panel | large | rust | `1_aggregate_survey_microdata_to_panel` (100%) | `5_sun_abraham_robustness` (0%) | `2_cs_fit_with_stage2_survey_design` (0%) | +| 4. SDiD few markets | medium | python | `5_sensitivity_to_zeta_omega` (43%) | `3_in_time_placebo` (39%) | `2_sdid_bootstrap_variance_200` (9%) | +| 4. 
SDiD few markets | large | rust | `5_sensitivity_to_zeta_omega` (40%) | `3_in_time_placebo` (30%) | `1_sdid_jackknife_variance` (16%) | +| 5. Reversible dCDH | single | python | `4_heterogeneity_refit` (51%) | `1_dcdh_fit_Lmax3_survey_TSL` (48%) | `3_honest_did_on_placebo` (1%) | +| 5. Reversible dCDH | single | rust | `4_heterogeneity_refit` (50%) | `1_dcdh_fit_Lmax3_survey_TSL` (49%) | `3_honest_did_on_placebo` (1%) | +| 6. Pricing dose-response | single | python | `1_cdid_cubic_spline_bootstrap199` (25%) | `6_spline_sensitivity_num_knots2` (25%) | `5_spline_sensitivity_degree1` (25%) | +| 6. Pricing dose-response | single | rust | `1_cdid_cubic_spline_bootstrap199` (25%) | `6_spline_sensitivity_num_knots2` (25%) | `3_cdid_event_study_pretrend` (25%) | + + +Per-scenario phase narrative (cross-check against the table above after +any rerun): + +- **Staggered campaign.** ImputationDiD robustness and SunAbraham + consistently account for ~70-80% of the chain at every scale. They + sit in a narrow phase-share band (each typically ~25-50%) and which + one leads varies by (scale, backend) and can flip across reruns at + medium scale where the two are close; see the table for the exact + ordering per cell. CS fit with `n_bootstrap=999` (both with and + without covariates) is well-vectorized and sits well below both in + the ranking. Either phase is a legitimate optimization target; the + aggregate share is what drives the "next hotspot" priority. +- **Brand awareness survey.** At small scale HonestDiD dominates. From + medium onwards JK1 is the single largest phase under both backends; + see the table for the exact share per cell. Python and Rust totals + stay close across the sweep (within ~1.1x at any measured scale, + see scale-sweep table); the JK1 replicate-fit loop is not + Rust-accelerated, so the backends neither help nor hurt each other + meaningfully on this chain. 
+- **BRFSS.** `aggregate_survey` share of total grows with scale and is + effectively 100% of runtime at 1M rows. Downstream phases (CS fit, + SunAbraham, HonestDiD) are a fraction of a second combined. +- **SDiD few markets.** `sensitivity_to_zeta_omega` and + `in_time_placebo` are the two largest phases under Python at every + scale and under Rust at medium/large (together ~70% of the chain). + At Rust small the absolute cost collapses so far that per-phase + fixed overhead dominates and `2_sdid_bootstrap_variance_200` slightly + edges the other two. The difference across backends is absolute: + under Python these phases drive a multi-second chain, under Rust + they stay in the top ranks but of a sub-second total runtime. That + is the Python-vs-Rust story for this scenario. +- **Reversible dCDH.** Main fit and heterogeneity refit are the two + largest phases by design - together effectively the whole chain, + with the remainder on HonestDiD at <2%. The two phases sit within a + few percentage points of each other at this shape and the leader + can flip across reruns under either backend. Both fits run under + the same `SurveyDesign` and rebuild shared TSL scaffolding - that + is the optimization opportunity, independent of which side is + slightly larger on a given measurement. +- **Pricing dose-response.** Four spline fits account for essentially all + runtime; linear scaling in variant count. + +### Top hotspots ranked by total-time contribution + +| # | Location | Scenario + scale | Signal | Recommended action | +|---|---|---|---|---| +| 1 | `diff_diff/survey.py:1160` `_compute_stratified_psu_meat` | BRFSS @ 1M rows | dominates BRFSS chain at all scales, ~100% at 1M rows | **Algorithmic fix, highest priority.** Function called once per (state, year) cell (500 calls); per-call work rebuilds stratum-PSU scaffolding every time. Precompute stratum indexes once at `aggregate_survey` top-level and reuse. 
| +| 2 | `diff_diff/imputation.py` ImputationDiD fit (+ `diff_diff/sun_abraham.py` SunAbraham fit) | Staggered CS @ 1,500 units | together consistently ~70-80% of the chain at every scale; either can be the top phase at a given (scale, backend) cell | **Investigate only after BRFSS fix lands.** Total chain is well under practitioner-perceptible threshold; candidate follow-up. Either phase is a legitimate target. | +| 3 | `diff_diff/utils.py:1434` `_sc_weight_fw_numpy` | SDiD python @ any scale | dominates Python SDiD at all scales | **Already ported to Rust.** Python fallback acceptable as a teaching/safety path; non-production for n > 100. Python skipped at n=500 (jackknife cost would exceed 4 minutes per run). | +| 4 | `diff_diff/chaisemartin_dhaultfoeuille.py` dCDH fit + heterogeneity | Reversible (single scale) | main fit and survey-aware heterogeneity refit each rebuild TSL scaffolding; heterogeneity phase is as expensive as the main fit | **Cache/precompute** - heterogeneity refit duplicates the main fit's TSL setup under the same `SurveyDesign`. Not P0; newer code path (v3.1) never optimization-reviewed. | +| 5 | `diff_diff/continuous_did.py` CDiD spline bootstrap | Dose-response (single scale) | four spline fits ~equal, linear in variant count | **Leave alone** - well under perceptible threshold. | + +### Memory analysis + +End-to-end peak RSS and per-scenario growth are captured in each JSON +baseline under the `memory` field, recorded via a psutil background +sampler at 10 ms. A standalone `tracemalloc`-based allocator attribution +pass for the BRFSS-1M scenario lives at +`benchmarks/speed_review/mem_profile_brfss.py`; its scrubbed output is +in `benchmarks/speed_review/baselines/mem_profile_brfss_large_.txt`. + + +| Scenario | Scale | Py peak RSS (MB) | Py growth (MB) | Rust peak RSS (MB) | Rust growth (MB) | +|---|---|---:|---:|---:|---:| +| 1. 
Staggered campaign | small | 143 | 28 | 151 | 36 | +| | medium | 227 | 79 | 254 | 99 | +| | large | 472 | 245 | 588 | 322 | +| 2. Brand awareness survey | small | 127 | 12 | 128 | 13 | +| | medium | 188 | 54 | 185 | 50 | +| | large | 327 | 139 | 336 | 142 | +| 3. BRFSS microdata -> CS panel | small | 133 | 11 | 136 | 15 | +| | medium | 210 | 17 | 212 | 15 | +| | large | 418 | 17 | 429 | 33 | +| 4. SDiD few markets | small | 124 | 10 | 116 | 1 | +| | medium | 152 | 8 | 118 | 0 | +| | large | skip | skip | 118 | 0 | +| 5. Reversible dCDH | single | 135 | 22 | 135 | 21 | +| 6. Pricing dose-response | single | 123 | 9 | 121 | 8 | + + +The ~115-130 MB floor is the Python + diff-diff + numpy import footprint; +the "growth" columns are the practitioner-meaningful numbers. + +### Memory findings + +1. **BRFSS `aggregate_survey` is compute-bound, not memory-bound.** At + 20x data growth (50K -> 1M rows), working-memory growth stays in the + low tens of MB. The tracemalloc pass confirms: net retained allocation + after `aggregate_survey` returns is well under 1 MB; the top + allocation site is `tracemalloc`'s own linecache overhead (a smoking + gun that nothing else is allocating meaningfully). **The BRFSS cost + is pure CPU; the function is already memory-efficient.** This + strengthens the case for the precompute-scaffolding fix: low-risk, + pure CPU win, fits in any deployment environment including 512 MB + Lambda. +2. **Staggered CS chain is memory-heavier than wall-clock suggested.** At + 1,500 units the chain's peak RSS sits in the high-400s to high-500s + MB depending on backend. Fine for workstations, tight for 512 MB + Lambda tier. Bootstrap-999 in CS and ImputationDiD's saturated + regression are plausible drivers. Rust uses slightly more memory here + (likely FFI-held temporary array copies); not worth optimizing. +3. 
**JK1 replicate path is allocation-heavy at large replicate count.** + At 1,000 units × 160 replicates the chain's growth during run sits in + the mid-100s of MB (see memory table). Each replicate refit plus the + n × n_replicates weight matrix drives this. A Rust port would save + memory even though time is within noise today - the dual benefit + strengthens the case for the port if replicate counts grow. +4. **SDiD Rust path is essentially memory-free** (growth at or below a + single MB across scales). Rust does the work in native memory without + round-tripping through the Python allocator. Confirms the existing + Rust port is well-behaved on both axes. +5. **No scenario hits OOM territory at measured scales.** Peak RSS across + the whole sweep stays under 600 MB. 1 GB is a comfortable ceiling for + every scenario measured. + +### Priority of optimization opportunities + +| # | Opportunity | Time upside | Memory upside | Risk | Priority | +|---|---|---|---|---|---| +| 1 | `aggregate_survey` precompute stratum scaffolding | ~-20s at 1M rows | none (already memory-efficient) | Low | **High** | +| 2 | Staggered CS chain working-memory audit (Lambda-oriented) | none | ~200-300 MB at 1,500 units (peak RSS crosses 512 MB Lambda line under Rust) | Medium | Low (bump to Medium if Lambda deployment becomes a concrete ask) | +| 3 | dCDH: cache TSL scaffolding across main fit + heterogeneity refit | ~0.2s per chain | ~20 MB per chain | Low | Low | +| 4 | ImputationDiD fit-loop vectorization audit | ~0.1-0.3s at 1,500 units | unknown | Low | Low | +| 5 | Rust-port JK1 replicate fit loop | ~0.5s at 160 replicates | ~140 MB at 160 replicates | Medium | Low (demoted: Rust is no longer slower than Python on this path after rerun, so the "fix-a-Rust-regression" leg of the original rationale is gone) | + +**Bottom line: one clear priority, four optional.** #1 is the single +practitioner-perceptible win identified by this analysis and should be +the next PR. 
#2-5 are optional polish that should be prioritized by +concrete deployment-environment signal (Lambda OOMs, practitioner +reports of slowness at specific shapes), not proactively. + +### Correctness-adjacent observations (not P0, route separately) + +These are developer-ergonomics / API-consistency smells surfaced during +scenario development. None are silent-failures and none belong in this PR +or in the silent-failures audit; logging here for awareness. + +1. **`aggregate` / `level` parameter naming is inconsistent.** CS accepts + `aggregate="event_study"`; ContinuousDiD requires + `aggregate="eventstudy"` on `fit()` **but** `level="event_study"` on + `to_dataframe()`. Two different spellings within one estimator plus a + third cross-estimator spelling. Surfaced when the P1 exit-propagation + fix stopped silently swallowing the resulting `ValueError` in the + dose-response benchmark. Route: API-consistency cleanup, minor. +2. **`generate_survey_did_data(panel=True)` `treated` column.** Row-level + active-treatment indicator that is zero in pre-periods, which makes it + quietly incompatible with `check_parallel_trends` (expects unit-level + treatment group membership) and pre-period placebo tests. Tutorial 17 + does not hit this because it uses a 2x2 design where `post` discriminates + the comparison. Suggest adding a `treat_unit` column alongside `treated` + for generator output clarity. Route: DGP cleanup, minor. +3. **`SurveyDesign.replicate_method` case sensitivity.** `"jk1"` raises + `ValueError("must be one of {'Fay', 'SDR', 'BRR', 'JKn', 'JK1'}")`; + `"JK1"` works. Either normalize the input or mention the expected casing + in the error message. Route: API-ergonomics, minor. + +### What this baseline does not answer + +- OOM behaviour at the edge: the sweep captures peak RSS up to ~600 MB + (staggered CS large under Rust). 
Behaviour under a hard memory ceiling + (512 MB Lambda, 1 GB container) is not exercised; if deployment signal + emerges that practitioners hit those ceilings, a ceiling-test pass + should be added. +- Pure-Rust profiles: scenarios run the Rust backend as a black box. + Optimizing inside `rust/` is a separate concern owned by the crate + maintainers and is not in scope here. +- Real-data shapes: the scenarios use synthetic DGPs. The BRFSS scenario + uses a BRFSS-shaped synthetic panel, not actual BRFSS microdata. If a + real-data calibration becomes relevant, CDC BRFSS annual files are + public. + +### Reproducing + +```bash +pip install pyinstrument # one-time, dev-only +python benchmarks/speed_review/run_all.py # both backends, all scenarios + +# Single scenario, single backend: +DIFF_DIFF_BACKEND=rust python benchmarks/speed_review/bench_campaign_staggered.py +``` + +Raw JSON is written under `benchmarks/speed_review/baselines/` for +scenario-level diffing as the library evolves; flame HTMLs are written +alongside under `baselines/profiles/` (gitignored; regenerated on each run). + +--- + ## Results Achieved (v2.0.3) **v2.0.3 includes Rust backend optimizations** that further improve SyntheticDiD performance: diff --git a/docs/performance-scenarios.md b/docs/performance-scenarios.md new file mode 100644 index 00000000..3eeb89d5 --- /dev/null +++ b/docs/performance-scenarios.md @@ -0,0 +1,356 @@ +# Practitioner Workflow Scenarios for Performance Benchmarking + +This document defines the **realistic practitioner workloads** used to evaluate +diff-diff's end-to-end performance. It is the methodology input for the +per-scenario scripts under `benchmarks/speed_review/` and the findings in +`docs/performance-plan.md`. + +## Why this doc exists + +The existing `benchmarks/` suite measures **isolated `fit()` calls on synthetic +200-20,000 unit panels** against R packages for accuracy parity. 
That tells us +whether our point estimates and SEs match `did::att_gt` and `fixest::feols`. It +does **not** tell us what an analyst sees when they run a full 8-step Baker et +al. (2025) workflow on a real BRFSS state-policy panel or a staggered geo +campaign. Without that, any "should we optimize X?" or "should we port X to +Rust?" decision is made on intuition, not data. + +The scenarios below are the measurement surface for that decision. They are +chosen to: + +1. Cover the six practitioner decision-tree branches in + `docs/practitioner_decision_tree.rst` (simultaneous, staggered, reversible, + dose, few-markets, survey). +2. Exercise the code paths added in v3.0-v3.1 that the old `benchmarks/` never + touched: survey `SurveyDesign` (TSL, replicate weights, PSU-level + multiplier bootstrap), `aggregate_survey`, dCDH (reversible, `L_max`), + SyntheticDiD jackknife, ContinuousDiD dose-spline, and the 8-step + chain (Bacon -> fit -> HonestDiD -> cross-estimator robustness). +3. Use defensibly realistic data shapes anchored to applied-econ paper + conventions and industry writeups, **not** the 200 x 8 cookie cutter. + +This is a **measurement doc**, not a wishlist. It does not propose new +features, does not propose optimizations, and does not propose new estimators. +Anything discovered during measurement that looks like a bug gets flagged +separately and routed to the silent-failures audit, not folded into a perf PR. + +## How this doc is used + +Each scenario in section 4 defines: + +- **Persona / domain** - who runs this and why +- **Data shape** - n_units, n_periods, n_covariates, survey PSUs/strata, + microdata rows if relevant +- **Estimator + params** - including `covariates`, `n_bootstrap`, + `survey_design`, `aggregate`, any non-default knobs +- **Operation chain** - fit() is one step; the flow usually includes Bacon + decomposition, parallel-trends inspection, sensitivity analysis, aggregation, + and cross-estimator robustness. 
We time the **chain**, not just fit(). +- **Source anchor** - which tutorial, paper, or industry reference the + shape/workflow comes from + +For each scenario, `benchmarks/speed_review/` hosts a script +(`bench_<scenario>.py`) that: + +1. Generates (or loads) the data once. +2. Runs the full operation chain under `pyinstrument` and writes a flame HTML + to `benchmarks/speed_review/baselines/profiles/<scenario>[_<scale>]_<backend>.html`. +3. Writes a wall-clock JSON breakdown (per operation + total) to + `benchmarks/speed_review/baselines/<scenario>[_<scale>]_<backend>.json`. + Multi-scale scenarios include the scale segment (`_small`, `_medium`, + `_large`); single-scale scenarios (dose-response, reversible-dCDH) + omit it. +4. Runs under both `DIFF_DIFF_BACKEND=python` and `DIFF_DIFF_BACKEND=rust` + when Rust is available. Scenario 4 (SDiD few markets) skips the + Python backend at the `large` scale by design because its + pure-numpy jackknife would exceed 4 minutes per run without adding + signal; every other (scenario, scale) runs under both backends. The + Python-vs-Rust gap is the primary input to Rust-expansion decisions. + +The scenario scripts are **not** meant to replace `run_benchmarks.py` (which +serves a different purpose: R-parity accuracy). They complement it. + +## Ground rules for realism + +- **No 200 x 8 synthetic panels.** The existing benchmarks already do that. + Each scenario below is either a different shape entirely or a 200 x 8 panel + wrapped in realistic downstream operations (bootstrap, survey, sensitivity). +- **End-to-end, not isolated `fit()`.** Practitioners chain operations. A 50ms + fit inside a 999-replicate bootstrap wrapped in an 8-M-value HonestDiD loop + is a ~45-second end-to-end run where 90%+ of time may be outside the fit + call the old benchmark measured. +- **Cite why the shape is realistic.** Every scenario grounds its data shape + in an applied-econ paper, a tutorial, an industry writeup, or a bundled + real dataset.
If a scenario cannot cite a source for its shape, it does + not belong here. +- **Time includes I/O and prep.** The stopwatch starts at the first library + call a practitioner would write in their notebook and ends at the last + result-reporting call - `practitioner_next_steps()` or a `summary()`. Data + generation (synthetic) is outside the stopwatch; data load + (`load_mpdta()`, CSV read) is inside. + +## Scenarios + +### 1. Staggered Marketing Campaign - CS + Event Study + HonestDiD + +- **Persona / domain.** Growth / performance-marketing data scientist at a + tech or e-commerce company. A brand campaign rolls out to DMAs in two + waves; analyst needs overall lift, event-study dynamics, and a sensitivity + bound for the VP. +- **Data shape (scale sweep).** 26-period weekly panel, ~30% never-treated, + 2 covariates (`log_pop`, `baseline_spend`). Three scales: + - **small** - 150 units, 2 cohorts (GeoLift DMA-panel analog; US DMAs + cap at 210). + - **medium** - 500 units, 3 cohorts (pooled multi-region or multi-year + DMA panel). + - **large** - 1,500 units, 3 cohorts (county-level staggered policy + study; US has ~3,100 counties). +- **Estimator + params.** + ```python + CallawaySantAnna( + control_group="never_treated", + estimation_method="dr", + cluster="unit", + n_bootstrap=999, + ).fit(data, outcome="y", unit="unit", time="period", + first_treat="first_treat", covariates=["log_pop", "baseline_spend"], + aggregate="all") + ``` +- **Operation chain.** (1) `BaconDecomposition.fit()` for TWFE diagnostic; + (2) CS fit with `aggregate="all"` (populates simple, group, event_study); + (3) inspect event-study pre-period ATTs for pre-trends; (4) + `compute_honest_did(results, method="relative_magnitude", M=[0.5, 1.0, 1.5, 2.0])`; + (5) robustness: refit with `SunAbraham()` and `ImputationDiD()` for + cross-estimator comparison; (6) refit CS without covariates for the + Baker-mandated with/without comparison; (7) `practitioner_next_steps()`. 
+- **Source anchor.** `docs/tutorials/02_staggered_did.ipynb` (staggered DGP + pattern), `docs/tutorials/18_geo_experiments.ipynb` (DMA framing), + Callaway & Sant'Anna (2021), Baker et al. (2025) 8-step workflow from + `diff_diff/guides/llms-practitioner.txt`, GeoLift methodology docs for + DMA panel conventions. + +### 2. Brand Awareness Survey DiD - 2x2 with Survey Design + +- **Persona / domain.** Brand / market-research analytics lead at a CPG + or agency. Runs a pre/post awareness survey across test and control + markets with complex sampling (strata + PSU clusters + unequal weights). + Needs design-correct SEs or the CI is too narrow. +- **Data shape (scale sweep).** 12-period quarterly panel, high weight + variation, JK1 delete-one-PSU replicate weights (replicate count equals + the PSU count). Three scales: + - **small** - 200 units, 10 strata × 4 PSUs = 40 replicate columns + (Tutorial 17 analog). + - **medium** - 500 units, 15 strata × 6 PSUs = 90 replicate columns + (typical CPG quarterly brand-tracking wave). + - **large** - 1,000 units, 20 strata × 8 PSUs = 160 replicate columns + (multi-region brand tracking at scale, e.g. a national awareness + study with 50+ sub-markets). 
+- **Estimator + params.** Two variants in the same script: + ```python + # (a) Analytical TSL path + DifferenceInDifferences(robust=True).fit( + data, outcome="awareness", treatment="treated", time="post", + survey_design=SurveyDesign(weights="w", strata="stratum", + psu="cluster", fpc="fpc"), + ) + # (b) Replicate-weight path (JK1 delete-one-PSU weights produced by + # generate_survey_did_data(include_replicate_weights=True)) + SurveyDesign(weights="w", replicate_weights=rep_cols, + replicate_method="JK1") + ``` +- **Operation chain.** (1) naive `DifferenceInDifferences()` with no survey + design (for SE-inflation comparison); (2) `SurveyDesign.resolve()`; + (3) design-aware fit (TSL path); (4) design-aware fit (replicate-weight + path); (5) three funnel outcomes (awareness, consideration, purchase + intent) refit in a loop; (6) `check_parallel_trends()` and placebo pre- + period test; (7) `compute_honest_did()` with default M grid. +- **Source anchor.** `docs/tutorials/17_brand_awareness_survey.ipynb` + (workflow shape), `docs/tutorials/16_survey_did.ipynb` (SurveyDesign + API), CDC BRFSS 2024 technical docs (`_STSTR`/`_PSU`/`_LLCPWT` + variable conventions for the 10-stratum / 40-PSU shape), Rao & Scott + (1984) for design-effect weighting logic exercised by replicate path. + +### 3. BRFSS State-Policy Microdata -> CS Panel + +- **Persona / domain.** Health-policy / public-health researcher. Has BRFSS + respondent-level microdata across 10 years, wants to estimate the effect + of a staggered state policy (e.g., Medicaid expansion, smoking ban) on + a design-correct outcome using `aggregate_survey()` to collapse microdata + to a state-year panel, then a modern staggered estimator. +- **Data shape (scale sweep).** 50 states × 10 years × N respondents per + state-year cell, 5 adoption cohorts staggered over the window. Three scales: + - **small** - 50,000 rows (100/cell, 10 strata × 200 PSUs). Narrow + analytic slice on a state-year grid. 
+ - **medium** - 250,000 rows (500/cell, 15 strata × 600 PSUs). + Mid-range analytic slice on the same state-year grid. + - **large** - 1,000,000 rows (2,000/cell, 20 strata × 1,000 PSUs). + A realistic pooled 10-year multi-state analysis - comparable to the + kind of panel built from BRFSS 2024's ~458K-record universe filtered + and pooled across years. This is where practitioners actually live. +- **Estimator + params.** + ```python + panel, stage2 = aggregate_survey( + microdata, by=["state", "year"], outcomes="y", + survey_design=SurveyDesign(weights="finalwt", strata="strata", psu="psu"), + ) + CallawaySantAnna(control_group="never_treated", estimation_method="reg", + n_bootstrap=199).fit( + panel, outcome="y_mean", unit="state", time="year", + first_treat="first_treat", survey_design=stage2, aggregate="all", + ) + compute_honest_did(results, method="relative_magnitude", M=[0.5, 1.0, 1.5]) + ``` +- **Operation chain.** (1) `aggregate_survey()` - the microdata-to-panel + collapse; (2) CS fit with the second-stage SurveyDesign returned by + `aggregate_survey` (pweight + geographic PSU clustering; `aggregate_survey` + does not stratify the collapsed cell panel) and bootstrap at PSU level; + (3) event-study pre-trend inspection; (4) HonestDiD sensitivity grid; + (5) SunAbraham robustness refit using the same second-stage pweight + SurveyDesign; (6) `practitioner_next_steps()`. +- **Source anchor.** `docs/practitioner_getting_started.rst` ("What If + You Have Survey Data?" section), CDC BRFSS 2024 overview + (cdc.gov/brfss/annual_data/2024), `diff_diff.prep.aggregate_survey` + docstring + `docs/survey-roadmap.md`, CS paper for staggered ATT(g,t) + inference. + +### 4. Geo-Experiment Few Markets - SyntheticDiD + Jackknife + +- **Persona / domain.** Growth marketing analyst running a small-market + campaign test against a pool of control markets. Too few treated for + asymptotic CS SE; uses SyntheticDiD with jackknife variance and a + breakdown diagnostic for the VP. 
+- **Data shape (scale sweep).** 12 weekly periods (6 pre, 6 post), + 2 latent factors. Three scales: + - **small** - 80 units, 5 treated (Tutorial 18 analog, DMA-scale + geo-experiment). + - **medium** - 200 units, 15 treated (zip-cluster-scale or + multi-DMA geo experiment). + - **large** - 500 units, 30 treated (zip-level or large-scale geo + experiment; **Python backend skipped at this scale** because the + pure-numpy Frank-Wolfe solver plus jackknife would need ~500 per-unit + refits and exceed 4 minutes per run without adding signal beyond what + medium scale already shows). +- **Estimator + params.** + ```python + SyntheticDiD(variance_method="jackknife", n_bootstrap=0).fit(...) + # then also variance_method="bootstrap", n_bootstrap=200 for comparison + ``` +- **Operation chain.** (1) SDiD fit with `variance_method="jackknife"` - + exercises the leave-one-out refit loop; (2) SDiD fit with + `variance_method="bootstrap"`, `n_bootstrap=200` for SE comparison; + (3) `results.in_time_placebo()`; (4) `results.get_loo_effects_df()`; + (5) `results.sensitivity_to_zeta_omega()`; (6) + `results.get_weight_concentration()`. The jackknife loop is the primary + time sink; `sensitivity_to_zeta_omega` also refits. +- **Source anchor.** `docs/tutorials/18_geo_experiments.ipynb`, + Arkhangelsky et al. (2021), Mercado Libre geo-experiment writeup + (medium.com/mercadolibre-tech), Meta GeoLift methodology docs + (facebookincubator.github.io/GeoLift - 10-treated / 10-20-control + convention). + +### 5. Reversible Treatment - dCDH with L_max and Survey TSL + +- **Persona / domain.** Marketing analyst measuring an always-on-with- + dark-periods campaign, or a health-policy researcher studying a policy + that switches on and off. Reversible treatment breaks every other + staggered estimator; dCDH is the only option. +- **Data shape.** 120 groups x 10 periods, single-switch pattern per group, + ~40% always-control, survey-weighted with 8 strata and 24 PSUs. 
Larger + than the Tutorial's 80 x 6 demo to expose the `L_max` multi-horizon + influence-function allocation that was added in v3.1. +- **Estimator + params.** + ```python + ChaisemartinDHaultfoeuille().fit( + data, outcome="y", group="group", time="period", treatment="treated", + L_max=3, + survey_design=SurveyDesign(weights="pw", strata="stratum", psu="cluster"), + ) + ``` +- **Operation chain.** (1) dCDH fit with `L_max=3` (computes `DID_l` for + l=1..3, dynamic placebos, sup-t bands, TWFE diagnostic); (2) snapshot + `placebo_effect`, `overall_att`, `joiners_att`, `leavers_att` from the + result object for pre-trend evidence and joiner/leaver inspection; + (3) `compute_honest_did()` M-grid on the placebo event study; + (4) heterogeneity refit with `heterogeneity="group"`. The TSL path for + `L_max >= 1` is newer code (v3.1) and has not been profiled. +- **Source anchor.** `docs/practitioner_decision_tree.rst` + ("Reversible Treatment (On/Off Cycles)"), de Chaisemartin & D'Haultfoeuille + (2020), NBER WP 29873 (dynamic companion), R package + `DIDmultiplegtDYN` as methodological reference, `docs/methodology/REGISTRY.md` + dCDH section, `project_dcdh_shipped.md` for v3.1 feature set. + +### 6. Pricing Dose-Response - ContinuousDiD Cubic Spline + +- **Persona / domain.** Pricing / promo analyst at a retailer. Stores + received varying discount levels; analyst wants the dose-response curve + ATT(d), not just a binarized average. Requires Strong Parallel Trends. +- **Data shape.** 500 units (stores) x 6 quarterly periods, 1 cohort at + period 3, dose drawn from log-normal (range 1-12 percentage points off + baseline price), ~30% untreated (dose = 0). This is the Tutorial 14 + shape scaled from 200 to 500 units to stress the B-spline fitting. 
+- **Estimator + params.** + ```python + ContinuousDiD(degree=3, num_knots=1, n_bootstrap=199).fit( + data, outcome="y", unit="unit", time="period", first_treat="first_treat", + dose="dose", aggregate="dose", + ) + ``` +- **Operation chain.** (1) CDiD fit with `aggregate="dose"` - produces + overall ATT, overall ACRT, and the dose-response curves; (2) extract + `results.to_dataframe(level="dose_response")` and + `level="group_time"` (event-study is not populated by a dose-only + fit, so it is extracted in a separate step); (3) a second CDiD fit + with `aggregate="eventstudy"` for pre-trend diagnostics (note the + spelling: `fit(aggregate="eventstudy")` with no underscore, but + `to_dataframe(level="event_study")` with underscore - see the + correctness-adjacent observations in `performance-plan.md`); + (4) compare to a binarized DiD fit on the same data to quantify + information loss from binarizing; (5) alternate `degree=1` (linear) + and (6) `num_knots=2` refits for spline-sensitivity. The dose-curve + bootstrap loop (199 reps x spline refit) is the primary time sink. +- **Source anchor.** `docs/tutorials/14_continuous_did.ipynb`, + Callaway, Goodman-Bacon & Sant'Anna (2024), `docs/methodology/REGISTRY.md` + ContinuousDiD section. + +## Backend and environment notes + +All scenarios run under both backends where available: + +```bash +DIFF_DIFF_BACKEND=python python benchmarks/speed_review/bench_<scenario>.py +DIFF_DIFF_BACKEND=rust python benchmarks/speed_review/bench_<scenario>.py +``` + +The Python-vs-Rust gap is the primary input to the Rust-expansion decision in +`docs/performance-plan.md`. If Python is already within 2x of Rust for a +scenario, that scenario is a weak Rust-port candidate; if Python is 10x+ +slower, it is a strong candidate. + +Apple Silicon M4 note per `TODO.md`: a spurious numpy `RuntimeWarning` on +`matmul` for N > 260 does not affect correctness but can clutter profile +output. Scripts filter this warning so profiles stay clean. 
+ +## What is explicitly out of scope + +- **Optimizations.** This doc defines the measurement surface. Actual + performance fixes are separate PRs, each citing a specific + `docs/performance-plan.md` finding. +- **R-parity benchmarking.** That is `benchmarks/run_benchmarks.py`'s job + and remains valuable; these scenarios complement it. +- **Estimators without realistic practitioner flows.** TROP, EfficientDiD, + StackedDiD, and BaconDecomposition are exercised via the robustness + branches of scenarios 1 and 3; they do not get standalone scenarios + here. If a future practitioner tutorial gives one of them a distinct + end-to-end flow, a scenario can be added at that point. +- **Rust backend internals.** We measure the Rust backend as a black box + (backend=rust wall-clock, backend=rust profile breakdown). Optimizing + inside Rust is a separate concern handled by `rust/` crate owners. + +## Pointers + +- Scripts: `benchmarks/speed_review/bench_<scenario>.py` +- Raw results: `benchmarks/speed_review/baselines/<scenario>[_<scale>]_<backend>.json` +- Flame profiles: `benchmarks/speed_review/baselines/profiles/<scenario>[_<scale>]_<backend>.html` + (gitignored; regenerated per run) +- Findings doc: `docs/performance-plan.md` ("Practitioner Workflow Baseline" + section - per-scenario top-5 hot phases + recommended action category)