diff --git a/PROBE_RESULTS.md b/PROBE_RESULTS.md
new file mode 100644
index 0000000..f3ea376
--- /dev/null
+++ b/PROBE_RESULTS.md
@@ -0,0 +1,88 @@
+# ANE Probe Results: M4 (macOS 26.3)
+
+**Machine:** Apple M4 (10 cores), 32GB RAM, macOS 26.3
+**Date:** 2026-03-03
+**ANE Family:** H16 (same as M5 results in `training/m5result.md`)
+
+## Key Discovery: Compile and Eval Run in Parallel
+
+**This was not known before.** The M5 probes tested compile and eval sequentially.
+We tested with GCD `dispatch_async` and found they fully overlap.
+
+### probe_v2.m Results
+
+#### TEST 1: Pure Eval Throughput
+```
+Conv 128x128, spatial=64
+1000 evals: 189.1ms total, 0.189ms/eval
+11.09 GFLOPS sustained
+```
+
+#### TEST 2: Ping-pong (Two Pre-compiled Models)
+```
+500 ping-pong pairs: 207.4ms (0.415ms/pair, 0.207ms/eval)
+```
+Near-zero overhead switching between two loaded models.
+
+#### TEST 3: Sequential Compile (20 Models)
+```
+All 20 models compiled and verified ✓
+Compile time: ~23-29ms each (consistent, no degradation)
+All 20 models correct with different scale factors
+```
+
+#### TEST 4: Background Compile Overlap ⭐
+```
+Background compile: 26.8ms
+Foreground evals during compile: 119 (26.8ms total)
+Overlap: YES — compile and eval CAN run in parallel!
+Background model verified correct ✓
+```
+
+### Summary
+| Metric | Value |
+|--------|-------|
+| Compile time | ~25ms per kernel set |
+| Eval time | 0.189ms per eval |
+| Compile:eval ratio | ~130:1 |
+| Parallel compile+eval | **YES** |
+| Max simultaneous models | 20+ |
+| Ping-pong overhead | +10% vs single model |
+
+## Peak ANE Throughput (inmem_peak)
+
+```
+Config                W(MB)  GFLOP  ms/eval   TFLOPS
+96x conv 512ch sp64    48.0   3.22  0.429 ms    7.50
+128x conv 512ch sp64   64.0   4.29  0.589 ms    7.30
+256x conv 256ch sp64   32.0   2.15  0.380 ms    5.65
+64x conv 512ch sp64    32.0   2.15  0.395 ms    5.43
+```
+
+Peak: **7.50 TFLOPS** (47% of 15.8 TFLOPS theoretical).
+
+## Implications for Training
+
+### Before (train_large.m)
+- Synchronous compile: **88.6% of wall time is compilation**
+- 55ms compile per batch, 0.54ms actual training
+- Training throughput limited by compiler, not by ANE
+
+### After (train_double_buffer.m)
+- Async double-buffered compile: **0% compile stall**
+- Background compile happens during forward/backward passes
+- ~130 eval steps fit in one compile window
+- Weight updates are "delayed" by one batch (standard technique in distributed training)
+- Training throughput limited only by ANE eval speed
+
+### Architecture
+```
+Time →
+Active kernels: [=== eval batch N ===][=== eval batch N+1 ===][=== eval batch N+2 ===]
+Background:     [compile N+1 weights ][compile N+2 weights ][compile N+3 weights ]
+                 ↑                     ↑                     ↑
+                 swap ready            swap ready            swap ready
+```
+
+Two kernel sets (A and B) alternate between active evaluation and background compilation.
+When the background compile finishes, pointers swap atomically at the batch boundary.
diff --git a/training/Makefile b/training/Makefile
index 7f16c1a..7f4f10f 100644
--- a/training/Makefile
+++ b/training/Makefile
@@ -1,48 +1,50 @@
-CC = xcrun clang
-CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
-FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
-LDFLAGS = $(FRAMEWORKS) -ldl
-
-HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
-
-HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h
-
-train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
-	$(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS)
-
-train_large: train_large.m $(HEADERS_LARGE)
-	$(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate
-
-train_large_ane: train_large_ane.m $(HEADERS_ANE)
-	$(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate
-
-PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced
-
-test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE)
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
-
-test_classifier: test_classifier.m $(HEADERS_ANE)
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
-
-test_weight_reload: test_weight_reload.m
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
-
-test_perf_stats: test_perf_stats.m
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
-
-test_qos_sweep: test_qos_sweep.m
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
-
-test_ane_advanced: test_ane_advanced.m
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
-
-probes: $(PROBES)
-
-tokenize:
-	python3 tokenize.py
-
-clean:
-	rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier
-
-.PHONY: clean tokenize probes
-
+CC = xcrun clang
+CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc
+FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface
+LDFLAGS = $(FRAMEWORKS) -ldl
+
+HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h
+
+HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h
+
+train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h
+	$(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS)
+
+train_large: train_large.m $(HEADERS_LARGE)
+	$(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate
+
+train_large_ane: train_large_ane.m $(HEADERS_ANE)
+	$(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate
+
+train_double_buffer: train_double_buffer.m $(HEADERS_LARGE)
+	$(CC) $(CFLAGS) -o $@ train_double_buffer.m $(LDFLAGS) -framework Accelerate
+
+PROBES = test_weight_reload test_perf_stats test_qos_sweep test_ane_advanced
+
+test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE)
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
+
+test_classifier: test_classifier.m $(HEADERS_ANE)
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate
+
+test_weight_reload: test_weight_reload.m
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+test_perf_stats: test_perf_stats.m
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+test_qos_sweep: test_qos_sweep.m
+	$(CC)
$(CFLAGS) -o $@ $< $(LDFLAGS)
+
+test_ane_advanced: test_ane_advanced.m
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+probes: $(PROBES)
+
+tokenize:
+	python3 tokenize.py
+
+clean:
+	rm -f train train_large train_large_ane train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier
+
+.PHONY: clean tokenize probes
diff --git a/training/train_double_buffer.m b/training/train_double_buffer.m
new file mode 100644
index 0000000..d8b3882
--- /dev/null
+++ b/training/train_double_buffer.m
@@ -0,0 +1,782 @@
+// train_double_buffer.m — Double-buffered async ANE training for stories110M
+// Based on train_large.m with the key innovation: compile and eval overlap via GCD
+// Discovery: probe_v2.m proved ANE compile and eval can run in parallel
+// Architecture: two kernel sets (A/B), background compile while active set runs
+// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch
+// NOTE(review): the next include lost its angle-bracket target in patch transport
+// ("#include" with no file name); Foundation is required by the @autoreleasepool
+// and @{} literals used below — confirm against the original file.
+#include <Foundation/Foundation.h>
+#include "stories_io.h"
+#include "stories_mil.h"
+#include "stories_cpu_ops.h"
+
+// Double-buffer needs more compile budget than single-buffer
+// The original MAX_COMPILES=100 only allows 1 batch per exec() restart
+// We push higher to allow initial compile + at least 1 background compile
+// If ANE rejects at ~119, the exec() restart will handle it gracefully
+#define DB_MAX_COMPILES 250
+
+#define CKPT_PATH "ane_db_ckpt.bin"
+#define MODEL_PATH "../../assets/models/stories110M.bin"
+#define DATA_PATH "tinystories_data00.bin"
+
+// ===== Weight loading from llama2.c format =====
+// Reads a llama2.c-format checkpoint into per-layer weight structs.
+// Returns false if the file is missing or its config does not match the
+// compile-time DIM/HIDDEN/NLAYERS constants.
+static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) {
+    FILE *f = fopen(path, "rb");
+    if (!f) { printf("Cannot open %s\n", path); return false; }
+    Llama2Config cfg;
+    fread(&cfg, sizeof(cfg), 1, f);
+    // llama2.c encodes "shared classifier" as a positive vocab_size, so abs() here.
+    printf(" Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n",
+           cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len);
+    if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS) {
+        printf(" ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d\n", DIM, HIDDEN, NLAYERS);
+        fclose(f); return false;
+    }
+    int V = abs(cfg.vocab_size);
+    bool shared = cfg.vocab_size > 0;
+
+    // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all],
+    // rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls]
+    fread(embed, 4, V * DIM, f);
+
+    // rms_att weights for all layers (contiguous)
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f);
+    // wq for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f);
+    // wk for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f);
+    // wv for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f);
+    // wo for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f);
+    // rms_ffn weights for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f);
+    // w1 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f);
+    // w2 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f);
+    // w3 for all layers
+    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f);
+    // rms_final
+    fread(rms_final, 4, DIM, f);
+    // wcls = embed if shared (we just use embed pointer)
+
+    fclose(f);
+    printf(" Loaded pretrained weights (%s)\n", shared ?
+           "shared embed/cls" : "separate cls");
+    return true;
+}
+
+// ===== Compile one layer's kernels =====
+// Compiles the five weight-bearing ANE kernels for one transformer layer,
+// baking the layer's current weights into each compiled model blob.
+static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) {
+    lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
+        @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)},
+        @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)},
+        @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)},
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+    }), DIM*SEQ*2, 6*DIM*SEQ*2);
+
+    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
+        @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)},
+        @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)},
+        @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)},
+    }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
+
+    lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
+        @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)},
+        @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)},
+    }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
+
+    lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+        @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)},
+    }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
+
+    lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{
+        @"@model_path/weights/wqt.bin": @{@"offset":@0,
+            @"data":build_blob_t(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)},
+    }), 3*DIM*SEQ*2, DIM*SEQ*2);
+
+    return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd;
+}
+
+// Compile weight-free sdpaBwd2 (only needs once, no weights)
+static Kern *compile_sdpa_bwd2(void) {
+    return compile_kern_mil_w(gen_sdpa_bwd2(), @{},
+                              (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2);
+}
+
+static void free_layer_kernels(LayerKernels *lk) {
+    free_kern(lk->fwdAttn); free_kern(lk->fwdFFN); free_kern(lk->ffnBwd);
+    free_kern(lk->sdpaBwd1); free_kern(lk->qkvBwd);
+    // sdpaBwd2 is shared, freed separately
+    lk->fwdAttn = lk->fwdFFN = lk->ffnBwd = lk->sdpaBwd1 = lk->qkvBwd = NULL;
+}
+
+// ===== Checkpoint save/load =====
+// Serializes header, per-layer weights interleaved with their Adam state,
+// then final rmsnorm + embedding (each followed by its Adam m/v buffers).
+static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss,
+                            double cc, double ct, double cw, int cs, int cb, int adam_t,
+                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                            float *embed, AdamState *aembed) {
+    FILE *f = fopen(path, "wb");
+    CkptHdr h = {0};
+    h.magic = 0x424C5A54; h.version = 2;
+    h.step = step; h.total_steps = total_steps;
+    h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM;
+    h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ;
+    h.lr = lr; h.loss = loss;
+    h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw;
+    h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t;
+    fwrite(&h, sizeof(h), 1, f);
+    // Per-layer weights + adam
+    for (int L = 0; L < NLAYERS; L++) {
+        fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f);
+        fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f);
+        fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f);
+        fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f);
+        // Adam state
+        fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f);
+        fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f);
+        fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f);
+        fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f);
+        fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f);
+        fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f);
+        fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f);
+        fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f);
+        fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f);
+    }
+    fwrite(rms_final,4,DIM,f);
+    fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f);
+    fwrite(embed,4,VOCAB*DIM,f);
+    fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f);
+    fclose(f);
+}
+
+// Mirror of save_checkpoint: same field order, validates magic/version first.
+static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss,
+                            double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t,
+                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                            float *embed, AdamState *aembed) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    CkptHdr h;
+    fread(&h, sizeof(h), 1, f);
+    if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; }
+    *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss;
+    *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall;
+    *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t;
+    for (int L = 0; L < NLAYERS; L++) {
+        fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f);
+        fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f);
+        fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f);
+        fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f);
+        fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f);
+        fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f);
+        fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f);
+        fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f);
+        fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f);
+        fread(la[L].W2.m,4,W2_SZ,f);
fread(la[L].W2.v,4,W2_SZ,f); + fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f); + fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f); + fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f); + } + fread(rms_final,4,DIM,f); + fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f); + fread(embed,4,VOCAB*DIM,f); + fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f); + fclose(f); + return true; +} + +// ===== Main ===== +int main(int argc, char *argv[]) { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + mach_timebase_info(&g_tb); + + int total_steps = 10000; + float lr = 3e-4f; + float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; + int adam_t = 0, start_step = 0; + + // Parse args + bool do_resume = false; + for (int i=1; i