From df885ed3df967c7d08855a002f6c12d76b0352a5 Mon Sep 17 00:00:00 2001 From: Alvaro GPT Date: Mon, 2 Mar 2026 23:16:52 +0100 Subject: [PATCH] perf: reduce compile & IO overhead - Make ACCUM_STEPS configurable via ANE_ACCUM_STEPS env var (default 10) Higher values = fewer exec() restarts, better effective throughput - Make MAX_COMPILES configurable via ANE_MAX_COMPILES env var (default 100) Allows tuning for different hardware/OS versions - IOSurface pooling: reuse freed surfaces by size instead of creating new ones Avoids repeated IOSurfaceCreate/CFRelease on every recompile cycle Pool capacity: 128 surfaces; lookup is an O(n) linear scan by size, removal is an O(1) swap-remove --- training/stories_config.h | 16 +++++++++++++--- training/stories_io.h | 33 ++++++++++++++++++++++++++++++++- training/tiny_train.m | 30 +++++++++++++++++++----------- training/train_large.m | 10 ++++++---- 4 files changed, 70 insertions(+), 19 deletions(-) diff --git a/training/stories_config.h b/training/stories_config.h index f967974..1bdf07b 100644 --- a/training/stories_config.h +++ b/training/stories_config.h @@ -22,8 +22,18 @@ #define SEQ 256 #define NLAYERS 12 #define VOCAB 32000 -#define ACCUM_STEPS 10 -#define MAX_COMPILES 100 +#define ACCUM_STEPS_DEFAULT 10 +#define MAX_COMPILES_DEFAULT 100 + +static inline int get_accum_steps(void) { + const char *env = getenv("ANE_ACCUM_STEPS"); + return env ? atoi(env) : ACCUM_STEPS_DEFAULT; +} + +static inline int get_max_compiles(void) { + const char *env = getenv("ANE_MAX_COMPILES"); + return env ? 
atoi(env) : MAX_COMPILES_DEFAULT; +} // Per compile: 5 weight-bearing kernels per layer + 1 classifier = 5*12+1 = 61 // Plus 1 static (sdpaBwd2 per layer, no weights) = 12 more but those are weight-free @@ -86,7 +96,7 @@ typedef struct { } LayerGrads; // ANE kernels per layer -typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern; +typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; size_t inBytes, outBytes; } Kern; typedef struct { Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd; } LayerKernels; diff --git a/training/stories_io.h b/training/stories_io.h index 017d8a8..4916621 100644 --- a/training/stories_io.h +++ b/training/stories_io.h @@ -3,13 +3,41 @@ #include "stories_config.h" #include +// IOSurface pool — reuse freed surfaces of the same size +#define IOSURF_POOL_MAX 128 +static struct { + IOSurfaceRef surfaces[IOSURF_POOL_MAX]; + size_t sizes[IOSURF_POOL_MAX]; + int count; +} g_iosurf_pool = { .count = 0 }; + static IOSurfaceRef make_surface(size_t bytes) { + // Check pool for matching size + for (int i = 0; i < g_iosurf_pool.count; i++) { + if (g_iosurf_pool.sizes[i] == bytes) { + IOSurfaceRef s = g_iosurf_pool.surfaces[i]; + // Swap-remove + g_iosurf_pool.surfaces[i] = g_iosurf_pool.surfaces[--g_iosurf_pool.count]; + g_iosurf_pool.sizes[i] = g_iosurf_pool.sizes[g_iosurf_pool.count]; + return s; + } + } return IOSurfaceCreate((__bridge CFDictionaryRef)@{ (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); } +static void pool_return_surface(IOSurfaceRef s, size_t bytes) { + if (g_iosurf_pool.count < IOSURF_POOL_MAX) { + g_iosurf_pool.surfaces[g_iosurf_pool.count] = s; + g_iosurf_pool.sizes[g_iosurf_pool.count] = bytes; + g_iosurf_pool.count++; + } else { + CFRelease(s); + } +} + static NSData *build_blob(const float *w, int rows, int 
cols) { int ws=rows*cols*2, tot=128+ws; uint8_t *b=(uint8_t*)calloc(tot,1); @@ -110,6 +138,8 @@ static Kern *compile_kern_mil_w(NSString *mil, NSDictionary *weights, int ic_byt k->model = (void*)CFBridgingRetain(mdl); k->ioIn = make_surface(ic_bytes); k->ioOut = make_surface(oc_bytes); + k->inBytes = ic_bytes; + k->outBytes = oc_bytes; id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut); k->request = (void*)CFBridgingRetain(((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, @@ -123,7 +153,8 @@ static void free_kern(Kern *k) { if (!k) return; id mdl = (__bridge id)k->model; NSError *e = nil; ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - CFRelease(k->ioIn); CFRelease(k->ioOut); + pool_return_surface(k->ioIn, k->inBytes); + pool_return_surface(k->ioOut, k->outBytes); [[NSFileManager defaultManager] removeItemAtPath:(__bridge id)k->tmpDir error:nil]; CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir); free(k); diff --git a/training/tiny_train.m b/training/tiny_train.m index e1e9d7d..5db382f 100644 --- a/training/tiny_train.m +++ b/training/tiny_train.m @@ -198,9 +198,15 @@ static bool load_checkpoint(const char *path, CkptHeader *hdr, return true; } -#define MAX_COMPILES 100 +static inline int get_max_compiles_tiny(void) { + const char *env = getenv("ANE_MAX_COMPILES"); + return env ? atoi(env) : 100; +} +static inline int get_accum_steps_tiny(void) { + const char *env = getenv("ANE_ACCUM_STEPS"); + return env ? 
atoi(env) : 10; +} #define KERNELS_PER_STEP 4 -#define ACCUM_STEPS 10 // === Pipeline: background compile via GCD === typedef struct { @@ -231,6 +237,8 @@ int main(int argc, char *argv[]) { float lr = 1.0f; int start_step = 0; bool resuming = false; + int accum_steps = get_accum_steps_tiny(); + int max_compiles = get_max_compiles_tiny(); float *W1 = (float*)malloc(H * D * sizeof(float)); float *W2 = (float*)malloc(D * H * sizeof(float)); @@ -278,12 +286,12 @@ int main(int argc, char *argv[]) { for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n"); printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D); - printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS); + printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", accum_steps); printf("ANE FP16 peak: 15.8 TFLOPS (M4) | Weights: %.1f KB\n\n", weight_bytes/1024.0); printf("FLOPs/step: ANE=%.0f (fwd+bwd) CPU=%.0f (dW) Total=%.0f\n", ane_flops_per_step, cpu_flops_per_step, total_flops_per_step); printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n", - total_steps, lr, MAX_COMPILES); + total_steps, lr, max_compiles); } float *x = (float*)calloc(S * D, sizeof(float)); @@ -332,7 +340,7 @@ int main(int argc, char *argv[]) { int step = start_step; while (step < total_steps) { // Check compile budget - if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) { + if (g_compile_count + KERNELS_PER_STEP > max_compiles) { free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, @@ -358,7 +366,7 @@ int main(int argc, char *argv[]) { // So we need to update weights BEFORE launching background compile uint64_t t_batch = mach_absolute_time(); - for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + for (int a = 0; a < accum_steps && 
step < total_steps; a++, step++) { ane_eval_k(k1_fwd, x, h, D, H, S); for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; ane_eval_k(k2_fwd, h_relu, y, H, D, S); @@ -412,7 +420,7 @@ int main(int argc, char *argv[]) { // Pipeline: launch background compile with updated weights, // then immediately start NEXT batch's ANE evals with OLD kernels // while compile runs concurrently on GCD queue - bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES); + bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= max_compiles); if (can_pipeline) { // Snapshot weights for background compile @@ -445,7 +453,7 @@ int main(int argc, char *argv[]) { int steps_overlap = 0; uint64_t t_overlap = mach_absolute_time(); - for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + for (int a = 0; a < accum_steps && step < total_steps; a++, step++) { ane_eval_k(k1_fwd, x, h, D, H, S); for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; ane_eval_k(k2_fwd, h_relu, y, H, D, S); @@ -552,7 +560,7 @@ int main(int argc, char *argv[]) { // === Efficiency Report === printf("\n=== Efficiency Report ===\n"); printf("Total steps: %d\n", total_steps_done); - printf("Total batches: %d (accum %d steps each)\n", total_batches, ACCUM_STEPS); + printf("Total batches: %d (accum %d steps each)\n", total_batches, accum_steps); printf("Wall time: %.0f ms\n", total_wall_ms); printf("Compile time: %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms); printf("Train time: %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms); @@ -579,8 +587,8 @@ int main(int argc, char *argv[]) { printf("Weight params: %d (%.1f KB FP16)\n", H*D + D*H, weight_bytes / 1024.0); printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n", - total_compile_ms / total_batches, ACCUM_STEPS, - total_compile_ms / total_batches / ACCUM_STEPS); + total_compile_ms / total_batches, accum_steps, + total_compile_ms / total_batches / accum_steps); printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms); printf("Train fraction: %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms); diff --git a/training/train_large.m b/training/train_large.m index 17fb1c5..fa7266a 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -191,6 +191,8 @@ int main(int argc, char *argv[]) { float lr = 3e-4f; float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; int adam_t = 0, start_step = 0; + int accum_steps = get_accum_steps(); + int max_compiles = get_max_compiles(); // Parse args const char *ckpt_path = CKPT_PATH_DEFAULT; @@ -270,7 +272,7 @@ int main(int argc, char *argv[]) { printf("Params: %.2fM (transformer %.2fM + embed %.2fM)\n", tp/1e6, xfmr_params/1e6, embed_params/1e6); printf("Kernels: %d (%d weight-bearing + %d static sdpaBwd2)\n", TOTAL_WEIGHT_KERNELS+NLAYERS, TOTAL_WEIGHT_KERNELS, NLAYERS); - printf("Accum %d steps 
per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n", ACCUM_STEPS, lr, adam_b1, adam_b2); + printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n", accum_steps, lr, adam_b1, adam_b2); double fwd_f = NLAYERS*(4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ); double bwd_dx_f = fwd_f, bwd_dw_f = fwd_f; double sdpa_f = NLAYERS*2.0*HEADS*5*SEQ*SEQ*HD; @@ -331,7 +333,7 @@ int main(int argc, char *argv[]) { int step = start_step; while (step < total_steps) { // Check compile budget - if (g_compile_count + TOTAL_WEIGHT_KERNELS > MAX_COMPILES) { + if (g_compile_count + TOTAL_WEIGHT_KERNELS > max_compiles) { for (int L=0; L