diff --git a/CONTRIBUTION_SUBMISSION.md b/CONTRIBUTION_SUBMISSION.md new file mode 100644 index 0000000..120b72e --- /dev/null +++ b/CONTRIBUTION_SUBMISSION.md @@ -0,0 +1,95 @@ +# Contribution submission guide + +This file summarizes what was done on branch `contribution/benchmark-m5-and-fixes` and how to submit it. + +--- + +## 1. Benchmark (submit to Issue #3) + +**Link:** https://github.com/maderix/ANE/issues/3 + +**Post this as a new comment:** + +``` +## M5 MacBook Pro benchmark (static pipeline, 20 steps) + +- **Chip:** Apple M5, 10-core (4P+6E) +- **RAM:** 24 GB +- **macOS:** 26.3 (Build 25D125) +- **Run:** `./train_large --data ./tinystories_data00.bin --steps 20 --lr 1e-4` + +### Efficiency report +- Total steps: 20 +- Wall time: 10423 ms (10.4 s) +- Compile time: 7187 ms (69.0%) +- Train time: 2542 ms (24.4%) +- **Avg train: 127.1 ms/step** +- ANE TFLOPS: 0.73 sustained +- ANE utilization: 4.6% of 15.8 TFLOPS + +Full output with JSON lines is in `benchmarks/my_m5_benchmark_output.txt` (or paste the contents below). +``` + +Then paste the contents of `benchmarks/my_m5_benchmark_output.txt` in the same comment, or attach it. + +--- + +## 2. Bug fix (PR) + +**Fix:** Guard short token datasets in `train_large_ane.m` and `training/training_dynamic/train.m`. + +**Why:** When `n_tokens <= SEQ + 1`, the expression `max_pos = n_tokens - SEQ - 1` underflows (unsigned) — or yields zero at `n_tokens == SEQ + 1` — leading to a huge (or empty) random range and possible out-of-bounds reads. `train_large.m` already had this guard; the other two pipelines did not. + +**Changes:** +- `training/train_large_ane.m`: After `n_tokens = data_len / 2`, add a check that fails early with a clear error, munmap and close the fd, and return 1. +- `training/training_dynamic/train.m`: Same guard added. 
+ +**Suggested PR title:** `fix: guard short token datasets in train_large_ane and dynamic pipeline` + +**Suggested PR description:** + +```markdown +## Summary +- Add a token dataset length guard in `training/train_large_ane.m` +- Add the same guard in `training/training_dynamic/train.m` +- Fail early with a clear error when the dataset is too short for one (input, target) window + +## Why +Both paths use `max_pos = n_tokens - SEQ - 1`. When `n_tokens < SEQ + 1`, this unsigned subtraction underflows (and at `n_tokens == SEQ + 1` it yields zero), producing a huge (or empty) range and potentially out-of-bounds reads. `train_large.m` already had this guard (lines 299–304); this PR aligns the other two pipelines. + +## Validation +- `make -C training train_large_ane` — builds +- `make -C training/training_dynamic train` — builds +- With a too-short data file, both binaries exit with the new error message. +``` + +--- + +## 3. Optional: benchmark data in repo + +Branch also adds: +- `benchmarks/my_m5_benchmark_output.txt` — full benchmark log +- One new entry in `benchmarks/community_results.json` for this M5 run (contributor: `log-wade`) + +You can either: +- Include the `community_results.json` update in the same PR as the bug fix, or +- Omit it and only post the benchmark to Issue #3 (maintainer may update the report from the issue). + +--- + +## 4. Before opening the PR + +1. **Fork the repo** on GitHub (if you haven’t): https://github.com/maderix/ANE → Fork. +2. **Add your fork as a remote and push:** + ```bash + git remote add myfork git@github.com:YOUR_USERNAME/ANE.git + git push myfork contribution/benchmark-m5-and-fixes + ``` +3. Open a PR from `myfork/contribution/benchmark-m5-and-fixes` to `maderix/ANE` main. +4. Post the benchmark comment to Issue #3 (link above). + +--- + +## 5. Replace contributor name + +In `benchmarks/community_results.json`, the new entry uses `"contributor": "log-wade"`. Change that to your GitHub username if different. 
diff --git a/benchmarks/community_results.json b/benchmarks/community_results.json index e975925..2f31472 100644 --- a/benchmarks/community_results.json +++ b/benchmarks/community_results.json @@ -94,6 +94,19 @@ "peak_tflops_inmem": 12.17, "notes": "inmem_peak only, no training data submitted.", "contributor": "elijah-pelton" + }, + { + "chip": "M5", + "cores": "10-core (4P+6E)", + "ram_gb": 24, + "macos": "26.3", + "ms_per_step": [125, 128], + "ane_ms": [9.1, 9.2], + "compile_ms": [3554, 3633], + "ane_tflops": [0.72, 0.74], + "ane_util_pct": [4.57, 4.70], + "notes": "MacBook Pro, static pipeline train_large, 20 steps, random init.", + "contributor": "log-wade" } ], "neural_engine_specs": { diff --git a/benchmarks/my_m5_benchmark_output.txt b/benchmarks/my_m5_benchmark_output.txt new file mode 100644 index 0000000..568a600 --- /dev/null +++ b/benchmarks/my_m5_benchmark_output.txt @@ -0,0 +1,56 @@ +=== ANE Training: Stories110M (12 layers) === +dim=768 hidden=2048 heads=12 seq=256 vocab=32000 layers=12 +Cannot open stories110M.bin +Pretrained load failed, using random init +Params: 109.53M (transformer 84.95M + embed 24.58M) +Kernels: 72 (60 weight-bearing + 12 static sdpaBwd2) +Accum 10 steps per recompile | Adam LR=1.0e-04 b1=0.9 b2=0.999 +FLOPs/step: fwd=43487M bwd_dx=43487M bwd_dW=43487M sdpa_bwd=6040M total=174248M +ANE FLOPs/step: 93013M (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas) + +Token data: 20658981 tokens (41.3 MB) + Compiling layer 1/12... (12 compiles) Compiling layer 2/12... (17 compiles) Compiling layer 3/12... (22 compiles) Compiling layer 4/12... (27 compiles) Compiling layer 5/12... (32 compiles) Compiling layer 6/12... (37 compiles) Compiling layer 7/12... (42 compiles) Compiling layer 8/12... (47 compiles) Compiling layer 9/12... (52 compiles) Compiling layer 10/12... (57 compiles) Compiling layer 11/12... (62 compiles) Compiling layer 12/12... 
(67 compiles) Compiled 60 kernels in 3554ms +step 0 loss=10.3907 +{"type":"step","step":0,"loss":10.390698,"t_ane":12.288,"t_io":14.233,"t_cls":30.426,"t_elem":21.143,"t_rms":0.094,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":1,"loss":10.434500,"t_ane":10.653,"t_io":13.757,"t_cls":20.472,"t_elem":18.814,"t_rms":0.070,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":2,"loss":10.484736,"t_ane":10.050,"t_io":10.094,"t_cls":16.495,"t_elem":17.783,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":3,"loss":10.417551,"t_ane":9.755,"t_io":8.214,"t_cls":14.512,"t_elem":16.853,"t_rms":0.068,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":4,"loss":10.392599,"t_ane":9.537,"t_io":7.032,"t_cls":13.297,"t_elem":16.319,"t_rms":0.063,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":5,"loss":10.392069,"t_ane":9.404,"t_io":6.251,"t_cls":12.475,"t_elem":15.887,"t_rms":0.060,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":6,"loss":10.382063,"t_ane":9.313,"t_io":5.697,"t_cls":11.874,"t_elem":15.678,"t_rms":0.058,"t_cblas_wait":0.001,"compiles":72} +{"type":"step","step":7,"loss":10.377501,"t_ane":9.238,"t_io":5.293,"t_cls":11.437,"t_elem":15.556,"t_rms":0.056,"t_cblas_wait":0.001,"compiles":72} +{"type":"step","step":8,"loss":10.409813,"t_ane":9.174,"t_io":4.967,"t_cls":11.101,"t_elem":15.372,"t_rms":0.055,"t_cblas_wait":0.001,"compiles":72} +{"type":"step","step":9,"loss":10.395181,"t_ane":9.138,"t_io":4.720,"t_cls":10.819,"t_elem":15.289,"t_rms":0.054,"t_cblas_wait":0.001,"compiles":72} + [batch 10: compile=3554ms train=1253.8ms (125.4ms/step) compiles=72] + ane=9.1 io=4.7 cls=10.8 elem=15.3 rms=0.1 cblas_wait=0.0 ms/step +{"type":"batch","batch":10,"compile_ms":3554.3,"train_ms":1253.8,"ms_per_step":125.4} +{"type":"perf","ane_tflops":0.742,"ane_util_pct":4.70} +[exec() restart step 10, 72 compiles, loss=10.3952] +[RESUMED step 10, loss=10.3952] +Token data: 20658981 tokens (41.3 MB) + Compiling layer 
1/12... (12 compiles) Compiling layer 2/12... (17 compiles) Compiling layer 3/12... (22 compiles) Compiling layer 4/12... (27 compiles) Compiling layer 5/12... (32 compiles) Compiling layer 6/12... (37 compiles) Compiling layer 7/12... (42 compiles) Compiling layer 8/12... (47 compiles) Compiling layer 9/12... (52 compiles) Compiling layer 10/12... (57 compiles) Compiling layer 11/12... (62 compiles) Compiling layer 12/12... (67 compiles) Compiled 60 kernels in 3633ms +step 10 loss=10.2671 +{"type":"step","step":10,"loss":10.267123,"t_ane":13.398,"t_io":14.979,"t_cls":29.723,"t_elem":22.190,"t_rms":0.109,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":11,"loss":10.389436,"t_ane":11.150,"t_io":13.816,"t_cls":19.297,"t_elem":17.862,"t_rms":0.078,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":12,"loss":10.246490,"t_ane":10.356,"t_io":10.036,"t_cls":15.691,"t_elem":16.749,"t_rms":0.067,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":13,"loss":10.322395,"t_ane":9.971,"t_io":8.113,"t_cls":13.880,"t_elem":16.200,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":14,"loss":10.280519,"t_ane":9.708,"t_io":7.002,"t_cls":12.817,"t_elem":15.972,"t_rms":0.061,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":15,"loss":10.202168,"t_ane":9.575,"t_io":6.212,"t_cls":12.096,"t_elem":15.716,"t_rms":0.059,"t_cblas_wait":0.003,"compiles":72} +{"type":"step","step":16,"loss":10.306752,"t_ane":9.450,"t_io":5.685,"t_cls":11.577,"t_elem":15.530,"t_rms":0.057,"t_cblas_wait":0.003,"compiles":72} +{"type":"step","step":17,"loss":10.293774,"t_ane":9.361,"t_io":5.280,"t_cls":11.209,"t_elem":15.392,"t_rms":0.055,"t_cblas_wait":0.002,"compiles":72} +{"type":"step","step":18,"loss":10.263789,"t_ane":9.278,"t_io":4.976,"t_cls":10.908,"t_elem":15.263,"t_rms":0.054,"t_cblas_wait":0.002,"compiles":72} 
+{"type":"step","step":19,"loss":10.307909,"t_ane":9.237,"t_io":4.751,"t_cls":10.657,"t_elem":15.160,"t_rms":0.053,"t_cblas_wait":0.002,"compiles":72} + [batch 10: compile=3633ms train=1287.8ms (128.8ms/step) compiles=72] + ane=9.2 io=4.8 cls=10.7 elem=15.2 rms=0.1 cblas_wait=0.0 ms/step +{"type":"batch","batch":10,"compile_ms":3632.9,"train_ms":1287.8,"ms_per_step":128.8} +{"type":"perf","ane_tflops":0.722,"ane_util_pct":4.57} + +=== Efficiency Report === +Total steps: 20 +Wall time: 10423 ms (10.4 s) +Compile time: 7187 ms (69.0%) +Train time: 2542 ms (24.4%) +Avg train: 127.1 ms/step +ANE TFLOPS: 0.73 sustained +Total TFLOPS: 1.37 (ANE+CPU) +ANE utilization: 4.6% of 15.8 TFLOPS diff --git a/training/train_large_ane.m b/training/train_large_ane.m index 6a47a3f..f775f4f 100644 --- a/training/train_large_ane.m +++ b/training/train_large_ane.m @@ -285,6 +285,12 @@ int main(int argc, char *argv[]) { uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; } size_t n_tokens = data_len / 2; + if (n_tokens <= (size_t)(SEQ + 1)) { + printf("Token data too short: need at least %d tokens, got %zu\n", SEQ + 2, n_tokens); + munmap(token_data, data_len); + close(data_fd); + return 1; + } printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); // Gradient buffers diff --git a/training/training_dynamic/train.m b/training/training_dynamic/train.m index 4249a5b..433b4af 100644 --- a/training/training_dynamic/train.m +++ b/training/training_dynamic/train.m @@ -294,6 +294,12 @@ int main(int argc, char *argv[]) { uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; } size_t n_tokens = data_len / 2; + if (n_tokens <= (size_t)(SEQ + 1)) { + printf("Token data too short: need at least %d tokens, got %zu\n", SEQ + 2, n_tokens); + munmap(token_data, data_len); + 
close(data_fd); + return 1; + } printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); // Vocab compaction