diff --git a/inmem_peak.m b/inmem_peak.m
index 87b8163..5cae23c 100644
--- a/inmem_peak.m
+++ b/inmem_peak.m
@@ -5,6 +5,7 @@
 #import <dlfcn.h>
 #import <mach/mach_time.h>
 #import <IOSurface/IOSurface.h>
+#include "training/ane_compat.h"
 
 static mach_timebase_info_data_t g_tb;
 static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
@@ -27,8 +28,8 @@
 
 NSString *genMIL(int ch, int sp, int depth) {
     NSMutableString *m = [NSMutableString string];
-    [m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
-    [m appendFormat:@"    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
+    [m appendFormat:@"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, {\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"\"}})]\n{\n", g_ane_platform.mil_program];
+    [m appendFormat:@"    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ane_mil_target(), ch, sp];
     [m appendString:@"            string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n"
         @"            tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
         @"            tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@@ -89,6 +90,8 @@
 int main() {
     mach_timebase_info(&g_tb);
     dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",RTLD_NOW);
+    ane_detect_platform();
+    ane_print_platform();
     printf("=== Programmatic MIL → In-Memory ANE Peak ===\n\n");
     printf("%-28s %7s %7s %9s %7s %6s\n","Config","W(MB)","GFLOP","ms/eval","TFLOPS","%%peak");
     printf("----------------------------------------------------------------------\n");
@@ -104,7 +107,7 @@ int main() {
         char l[64]; snprintf(l,64,"%dx conv %dch sp%d",d,c,s);
         double ms=bench(c,s,d);
         double tf=ms>0?gf/ms:0;
-        if(ms>0)printf("%-28s %6.1f  %6.2f  %7.3f ms %6.2f  %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100);
+        if(ms>0)printf("%-28s %6.1f  %6.2f  %7.3f ms %6.2f  %5.1f%%\n",l,w,gf,ms,tf,tf/ane_peak_tflops()*100);
         else printf("%-28s %6.1f  %6.2f  FAIL(%.0f)\n",l,w,gf,ms);
     }
     return 0;
diff --git a/training/ane_compat.h b/training/ane_compat.h
new file mode 100644
index 0000000..8c5ed22
--- /dev/null
+++ b/training/ane_compat.h
@@ -0,0 +1,224 @@
+// ane_compat.h — Runtime platform detection for Apple Silicon ANE compatibility
+// Detects chip family, macOS version, ANE peak TFLOPS, and appropriate MIL target
+#pragma once
+#import <Foundation/Foundation.h>
+#include <sys/sysctl.h>
+#include <string.h>
+#include <stdio.h>
+
+// Apple Silicon chip families the detector can report, grouped by generation.
+typedef enum {
+    ANE_CHIP_UNKNOWN  = 0,   // brand string matched no known family
+    ANE_CHIP_M1       = 1,
+    ANE_CHIP_M1_PRO   = 2,
+    ANE_CHIP_M1_MAX   = 3,
+    ANE_CHIP_M1_ULTRA = 4,
+    ANE_CHIP_M2       = 5,
+    ANE_CHIP_M2_PRO   = 6,
+    ANE_CHIP_M2_MAX   = 7,
+    ANE_CHIP_M2_ULTRA = 8,
+    ANE_CHIP_M3       = 9,
+    ANE_CHIP_M3_PRO   = 10,
+    ANE_CHIP_M3_MAX   = 11,
+    ANE_CHIP_M3_ULTRA = 12,
+    ANE_CHIP_M4       = 13,
+    ANE_CHIP_M4_PRO   = 14,
+    ANE_CHIP_M4_MAX   = 15,
+    ANE_CHIP_M4_ULTRA = 16,
+    ANE_CHIP_M5       = 17,
+    ANE_CHIP_M5_PRO   = 18,
+    ANE_CHIP_M5_MAX   = 19,
+    ANE_CHIP_M5_ULTRA = 20,
+} ANEChipFamily;
+
+// Snapshot of the host platform, filled in by ane_detect_platform()
+typedef struct {
+    ANEChipFamily chip;       // Family parsed from the brand string; ANE_CHIP_UNKNOWN if nothing matched
+    char chip_name[64];       // Raw sysctl string used for detection (at most 63 chars), e.g. "Apple M4"
+    int macos_major;          // e.g. 14, 15
+    int macos_minor;          // e.g. 0, 1
+    double ane_peak_tflops;   // Estimated FP16 peak TFLOPS, looked up from the chip family
+    const char *mil_target;   // "ios16", "ios17", or "ios18", chosen from macos_major
+    const char *mil_program;  // "1.0" for ios16/17, "1.3" for ios18
+    bool api_available;       // True only when both _ANEInMemoryModelDescriptor and _ANEInMemoryModel resolve
+} ANEPlatform;
+
+// Cached detection result and its "already detected" flag; both are written by ane_detect_platform()
+static ANEPlatform g_ane_platform = {0};      // all-zero (mil_target / mil_program are NULL) until detection has run
+static bool g_ane_platform_detected = false;  // `static` in a header: each file that includes it gets its own copy of both
+
+// ---- Internal helpers ----
+
+static ANEChipFamily _ane_identify_chip(const char *brand) {
+    // Map a sysctl brand string (e.g. "Apple M4", "Apple M2 Pro") to its chip family.
+    // Matching is by substring, so within each generation the suffixed variants
+    // ("Ultra", "Max", "Pro") are listed before the bare "Mn" tag, which would
+    // otherwise match them as well. The first hit in table order wins.
+    static const struct {
+        const char *tag;
+        ANEChipFamily family;
+    } kChipTags[] = {
+        {"M5 Ultra", ANE_CHIP_M5_ULTRA}, {"M5 Max", ANE_CHIP_M5_MAX}, {"M5 Pro", ANE_CHIP_M5_PRO}, {"M5", ANE_CHIP_M5},
+        {"M4 Ultra", ANE_CHIP_M4_ULTRA}, {"M4 Max", ANE_CHIP_M4_MAX}, {"M4 Pro", ANE_CHIP_M4_PRO}, {"M4", ANE_CHIP_M4},
+        {"M3 Ultra", ANE_CHIP_M3_ULTRA}, {"M3 Max", ANE_CHIP_M3_MAX}, {"M3 Pro", ANE_CHIP_M3_PRO}, {"M3", ANE_CHIP_M3},
+        {"M2 Ultra", ANE_CHIP_M2_ULTRA}, {"M2 Max", ANE_CHIP_M2_MAX}, {"M2 Pro", ANE_CHIP_M2_PRO}, {"M2", ANE_CHIP_M2},
+        {"M1 Ultra", ANE_CHIP_M1_ULTRA}, {"M1 Max", ANE_CHIP_M1_MAX}, {"M1 Pro", ANE_CHIP_M1_PRO}, {"M1", ANE_CHIP_M1},
+    };
+    const size_t tag_count = sizeof(kChipTags) / sizeof(kChipTags[0]);
+    for (size_t i = 0; i < tag_count; i++) {
+        if (strstr(brand, kChipTags[i].tag) != NULL) {
+            return kChipTags[i].family;
+        }
+    }
+    // No known tag appears in the brand string.
+    return ANE_CHIP_UNKNOWN;
+}
+
+// Estimated FP16 ANE peak TFLOPS per chip.
+// Apple publishes INT8 TOPS; FP16 throughput is roughly half.
+// Values are best-effort estimates from known hardware specs.
+// Ultra variants double the base die's ANE (2x neural engines).
+static double _ane_peak_tflops(ANEChipFamily chip) {
+    switch (chip) {  // base, Pro and Max share one figure per generation; Ultra is twice that
+        case ANE_CHIP_M1:
+        case ANE_CHIP_M1_PRO:
+        case ANE_CHIP_M1_MAX:   return 5.5;
+        case ANE_CHIP_M1_ULTRA: return 11.0;
+        case ANE_CHIP_M2:
+        case ANE_CHIP_M2_PRO:
+        case ANE_CHIP_M2_MAX:   return 7.9;   // 15.8 TOPS / 2
+        case ANE_CHIP_M2_ULTRA: return 15.8;
+        case ANE_CHIP_M3:
+        case ANE_CHIP_M3_PRO:
+        case ANE_CHIP_M3_MAX:   return 9.0;   // 18 TOPS / 2
+        case ANE_CHIP_M3_ULTRA: return 18.0;
+        case ANE_CHIP_M4:
+        case ANE_CHIP_M4_PRO:
+        case ANE_CHIP_M4_MAX:   return 15.8;  // Empirically measured in this project
+        case ANE_CHIP_M4_ULTRA: return 31.6;
+        case ANE_CHIP_M5:
+        case ANE_CHIP_M5_PRO:
+        case ANE_CHIP_M5_MAX:   return 19.0;  // 38 TOPS / 2 (estimate)
+        case ANE_CHIP_M5_ULTRA: return 38.0;
+        default:                return 15.8;  // Fallback: assume M4-class
+    }
+}
+
+static const char *_ane_chip_name_str(ANEChipFamily chip) {
+    // Display names laid out in ANEChipFamily declaration order, so an
+    // enumerator's numeric value is its index here (slot 0 is ANE_CHIP_UNKNOWN).
+    // Keep this table in step with the enum when adding chips.
+    static const char *const kChipNames[] = {
+        "Unknown",
+        "M1", "M1 Pro",
+        "M1 Max", "M1 Ultra",
+        "M2", "M2 Pro",
+        "M2 Max", "M2 Ultra",
+        "M3", "M3 Pro",
+        "M3 Max", "M3 Ultra",
+        "M4", "M4 Pro",
+        "M4 Max", "M4 Ultra",
+        "M5", "M5 Pro",
+        "M5 Max", "M5 Ultra",
+    };
+    const size_t name_count = sizeof(kChipNames) / sizeof(kChipNames[0]);
+    const size_t slot = (size_t)chip;
+    // Values outside the table get the same label as the unknown family.
+    if (slot >= name_count) {
+        return "Unknown";
+    }
+    return kChipNames[slot];
+}
+
+// ---- Public API ----
+
+// Detect the current platform. Call once at startup.
+// Returns the populated ANEPlatform struct (also stored in g_ane_platform).
+static ANEPlatform ane_detect_platform(void) {
+    // Detection runs once; later calls return the cached struct.
+    if (g_ane_platform_detected) {
+        return g_ane_platform;
+    }
+
+    ANEPlatform info = {0};
+
+    // Chip: read the CPU brand string; if that sysctl fails, try hw.model
+    // instead. The fallback's return value is not checked, so when both fail
+    // the chip is identified from whatever the buffer holds (normally nothing).
+    char brand_buf[128] = {0};
+    size_t brand_len = sizeof(brand_buf);
+    const int brand_rc = sysctlbyname("machdep.cpu.brand_string", brand_buf, &brand_len, NULL, 0);
+    if (brand_rc != 0) {
+        brand_len = sizeof(brand_buf);
+        sysctlbyname("hw.model", brand_buf, &brand_len, NULL, 0);
+    }
+    // info is zero-filled, so copying at most size-1 bytes
+    // always leaves chip_name NUL-terminated.
+    strncpy(info.chip_name, brand_buf, sizeof(info.chip_name) - 1);
+    info.chip = _ane_identify_chip(brand_buf);
+    info.ane_peak_tflops = _ane_peak_tflops(info.chip);
+
+    // macOS version as reported by NSProcessInfo.
+    const NSOperatingSystemVersion os_version = [[NSProcessInfo processInfo] operatingSystemVersion];
+    info.macos_major = (int)os_version.majorVersion;
+    info.macos_minor = (int)os_version.minorVersion;
+
+    // MIL target and program version, keyed on the macOS major version:
+    //    15 and newer → ios18 + program(1.3)
+    //    14           → ios17 + program(1.0)
+    //    13 and older → ios16 + program(1.0)
+    const bool targets_ios18 = (info.macos_major >= 15);
+    const bool targets_ios17 = (info.macos_major == 14);
+    if (targets_ios18) {
+        info.mil_target = "ios18";
+    } else {
+        info.mil_target = targets_ios17 ? "ios17" : "ios16";
+    }
+    info.mil_program = targets_ios18 ? "1.3" : "1.0";
+
+    // The API counts as available only when both class names resolve.
+    info.api_available = (NSClassFromString(@"_ANEInMemoryModelDescriptor") != nil &&
+                          NSClassFromString(@"_ANEInMemoryModel") != nil);
+
+    // Publish the result and mark detection as done.
+    g_ane_platform = info;
+    g_ane_platform_detected = true;
+    return info;
+}
+
+// Print a short report of the detected platform to stdout (detects first if needed)
+static void ane_print_platform(void) {
+    // ane_detect_platform() hands back the cached struct when detection has already run.
+    const ANEPlatform plat = ane_detect_platform();
+    printf("=== ANE Platform ===\n");
+    printf("  Chip:       %s (%s)\n", _ane_chip_name_str(plat.chip), plat.chip_name);
+    printf("  macOS:      %d.%d\n", plat.macos_major, plat.macos_minor);
+    printf("  ANE peak:   %.1f TFLOPS (FP16 est.)\n", plat.ane_peak_tflops);
+    printf("  MIL target: %s (program %s)\n", plat.mil_target, plat.mil_program);
+    printf("  API ready:  %s\n", plat.api_available ? "YES" : "NO");
+    printf("====================\n");
+}
+
+// Generate the MIL header string with correct program version and build info.
+// Returns an autoreleased NSString.
+static NSString *ane_mil_header(void) {
+    // Detects on first use; afterwards this just reads the cached platform.
+    const ANEPlatform plat = ane_detect_platform();
+    return [NSString stringWithFormat:
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n",
+        plat.mil_program];
+}
+
+// Get the MIL function target annotation (e.g. "ios17" or "ios18")
+static const char *ane_mil_target(void) {
+    // Detects on first use, then serves the cached value.
+    return ane_detect_platform().mil_target;
+}
+
+// Get the ANE peak TFLOPS for utilization calculations
+static double ane_peak_tflops(void) {
+    // Detects on first use, then serves the cached value.
+    return ane_detect_platform().ane_peak_tflops;
+}
diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h
index 97fc451..80694b5 100644
--- a/training/ane_mil_gen.h
+++ b/training/ane_mil_gen.h
@@ -1,208 +1,213 @@
-// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs
-#pragma once
-#import <Foundation/Foundation.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-// Build an FP16 weight blob with the required header structure.
-// weights_f32: source weights in row-major [out_ch, in_ch]
-// Returns NSData with header + FP16 weights
-static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) {
-    NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16
-    NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    uint8_t *chunk = buf + 64;
-    chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE;
-    chunk[4] = 0x01;
-    *(uint32_t*)(chunk + 8) = (uint32_t)wsize;   // data_size
-    *(uint32_t*)(chunk + 16) = 128;               // data_offset (from file start)
-    // Convert f32 → fp16 (simple truncation via _Float16)
-    _Float16 *fp16 = (_Float16*)(buf + 128);
-    for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++)
-        fp16[i] = (_Float16)weights_f32[i];
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input)
-// Input x: [1, in_ch, spatial] fp32
-// Input W: [1, out_ch, in_ch] fp32
-// Output:  [1, out_ch, spatial] fp32
-static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
-    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
-        "        tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
-        "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
-        "        bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
-        "        tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
-        "    } -> (y);\n"
-        "}\n",
-        in_ch, spatial, out_ch, in_ch,
-        in_ch, spatial, out_ch, in_ch,
-        out_ch, spatial, out_ch, spatial];
-}
-
-// Keep the baked-weight version for reference (used in inference-only scenarios)
-static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
-    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
-        "    } -> (y);\n"
-        "}\n",
-        in_ch, spatial, in_ch, spatial,
-        out_ch, in_ch, out_ch, in_ch,
-        out_ch, spatial, out_ch, spatial];
-}
-
-// Generate MIL for fused QKV: 3 parallel convs from same input
-// Input:  [1, dim, 1, S]
-// Outputs: Q[1, dim, 1, S], K[1, dim, 1, S], V[1, dim, 1, S]
-// Weight blob layout: Wq[dim,dim] @ offset 64, Wk @ offset 64+cs, Wv @ offset 64+2*cs
-// where cs = 64 + dim*dim*2
-static NSString *mil_gen_qkv(int dim, int spatial) {
-    NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
-    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
-        "    } -> (q, k, v);\n"
-        "}\n",
-        dim, spatial, dim, spatial,
-        dim, dim, dim, dim,
-        dim, dim, dim, dim, (unsigned long)(64 + cs),
-        dim, dim, dim, dim, (unsigned long)(64 + 2*cs),
-        dim, spatial, dim, spatial, dim, spatial,
-        dim, spatial, dim, spatial, dim, spatial];
-}
-
-// Build weight blob for fused QKV (3 weight matrices concatenated)
-static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) {
-    NSUInteger wsize = (NSUInteger)dim * dim * 2;
-    NSUInteger cs = 64 + wsize;
-    NSUInteger total = 64 + 3 * cs;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    const float *ws[3] = {wq, wk, wv};
-    for (int w = 0; w < 3; w++) {
-        uint8_t *chunk = buf + 64 + w * cs;
-        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE;
-        chunk[4]=0x01;
-        *(uint32_t*)(chunk + 8) = (uint32_t)wsize;
-        *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset
-        _Float16 *fp16 = (_Float16*)(chunk + 64);
-        for (NSUInteger i = 0; i < (NSUInteger)dim * dim; i++)
-            fp16[i] = (_Float16)ws[w][i];
-    }
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-// Build weight blob for fused FFN up (w1 + w3, both [hidden_dim, dim])
-static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) {
-    NSUInteger wsize = (NSUInteger)hidden_dim * dim * 2;
-    NSUInteger cs = 64 + wsize;
-    NSUInteger total = 64 + 2 * cs;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    const float *ws[2] = {w1, w3};
-    for (int w = 0; w < 2; w++) {
-        uint8_t *chunk = buf + 64 + w * cs;
-        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE;
-        chunk[4]=0x01;
-        *(uint32_t*)(chunk + 8) = (uint32_t)wsize;
-        *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset
-        _Float16 *fp16 = (_Float16*)(chunk + 64);
-        for (NSUInteger i = 0; i < (NSUInteger)hidden_dim * dim; i++)
-            fp16[i] = (_Float16)ws[w][i];
-    }
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-// Generate MIL for fused FFN up: w1 + w3 parallel convs
-static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
-    NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
-    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
-        "    } -> (out1, out3);\n"
-        "}\n",
-        dim, spatial, dim, spatial,
-        hidden_dim, dim, hidden_dim, dim,
-        hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),
-        hidden_dim, spatial, hidden_dim, spatial,
-        hidden_dim, spatial, hidden_dim, spatial];
-}
+// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs
+#pragma once
+#import <Foundation/Foundation.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "ane_compat.h"
+
+// Build an FP16 weight blob with the required header structure.
+// weights_f32: source weights in row-major [out_ch, in_ch]
+// Returns NSData with header + FP16 weights, or nil if the buffer cannot be allocated
+static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) {
+    NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16: 2 bytes per weight
+    NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    if (!buf) return nil; // allocation failed: report it instead of writing through NULL
+    buf[0] = 0x01; buf[4] = 0x02;
+    uint8_t *chunk = buf + 64;
+    chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE; chunk[4] = 0x01;
+    *(uint32_t*)(chunk + 8) = (uint32_t)wsize;   // data_size
+    *(uint32_t*)(chunk + 16) = 128;               // data_offset (from file start)
+    // Convert f32 → fp16 with a plain cast (a rounding conversion, not bit truncation)
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++)
+        fp16[i] = (_Float16)weights_f32[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input)
+// Input x: [1, in_ch, spatial] fp32
+// Input W: [1, out_ch, in_ch] fp32
+// Output:  [1, out_ch, spatial] fp32
+static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
+    // Detect up front: argument order is unspecified, so g_ane_platform could be read (still NULL) before ane_mil_target() ran.
+    const ANEPlatform plat = ane_detect_platform();
+    return [NSString stringWithFormat:
+        @"program(%s)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, {\"coremlc-version\", \"\"}, "
+        "{\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
+        "        tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
+        "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
+        "        bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
+        "        tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    } -> (y);\n"
+        "}\n",
+        plat.mil_program, plat.mil_target,
+        in_ch, spatial, out_ch, in_ch,
+        in_ch, spatial, out_ch, in_ch,
+        out_ch, spatial, out_ch, spatial];
+}
+
+// Keep the baked-weight version for reference (used in inference-only scenarios)
+static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
+    // Detect up front: argument order is unspecified, so g_ane_platform could be read (still NULL) before ane_mil_target() ran.
+    const ANEPlatform plat = ane_detect_platform();
+    return [NSString stringWithFormat:
+        @"program(%s)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, {\"coremlc-version\", \"\"}, "
+        "{\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    } -> (y);\n"
+        "}\n",
+        plat.mil_program, plat.mil_target,
+        in_ch, spatial, in_ch, spatial,
+        out_ch, in_ch, out_ch, in_ch,
+        out_ch, spatial, out_ch, spatial];
+}
+
+// Generate MIL text for the fused QKV kernel: three parallel 1x1 convs (Wq, Wk, Wv) over one input.
+// Input x is fp32 [1, dim, 1, spatial]; it is cast to fp16, convolved, and the results cast back.
+// Outputs q, k, v are each fp32 [1, dim, 1, spatial]; weights are fp16 [dim, dim, 1, 1].
+// BLOBFILE offsets: Wq @ 64, Wk @ 64+cs, Wv @ 64+2*cs, where cs = 64 + dim*dim*2 bytes;
+// cs matches the chunk stride used by mil_build_qkv_weight_blob, so keep the two in sync.
+static NSString *mil_gen_qkv(int dim, int spatial) {
+    NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;   // bytes per chunk: 64-byte chunk header + dim*dim fp16 values
+    return [NSString stringWithFormat:
+        @"program(%s)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n"
+        "{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
+        "    } -> (q, k, v);\n"
+        "}\n",
+        g_ane_platform.mil_program, ane_mil_target(),    // program(%s), func main<%s>
+        dim, spatial, dim, spatial,                      // x (fp32), x16 (fp16)
+        dim, dim, dim, dim,                              // Wq: declared type, value type (offset fixed at 64)
+        dim, dim, dim, dim, (unsigned long)(64 + cs),    // Wk: declared type, value type, blob offset
+        dim, dim, dim, dim, (unsigned long)(64 + 2*cs),  // Wv: declared type, value type, blob offset
+        dim, spatial, dim, spatial, dim, spatial,        // q16, k16, v16 (fp16 conv outputs)
+        dim, spatial, dim, spatial, dim, spatial];       // q, k, v (fp32 outputs)
+}
+
+// Build the fused-QKV weight blob: 64-byte file header, then chunks for wq, wk, wv (dim*dim floats each,
+// stored as fp16 after a 64-byte chunk header). Returns nil if the buffer cannot be allocated.
+static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) {
+    NSUInteger count = (NSUInteger)dim * dim, wsize = count * 2;   // elements / fp16 bytes per matrix
+    NSUInteger cs = 64 + wsize, total = 64 + 3 * cs;               // chunk stride / whole blob size
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    if (!buf) return nil;                                          // was an unchecked NULL dereference
+    buf[0] = 0x01; buf[4] = 0x02;
+    const float *ws[3] = {wq, wk, wv};
+    for (int w = 0; w < 3; w++) {
+        uint8_t *chunk = buf + 64 + w * cs;
+        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; chunk[4]=0x01;
+        uint32_t size32 = (uint32_t)wsize, data32 = (uint32_t)(64 + w * cs + 64); // absolute data offset
+        memcpy(chunk + 8, &size32, 4);    // memcpy: chunk is only 2-byte aligned when dim is odd
+        memcpy(chunk + 16, &data32, 4);
+        _Float16 *fp16 = (_Float16*)(chunk + 64);
+        for (NSUInteger i = 0; i < count; i++) fp16[i] = (_Float16)ws[w][i];
+    }
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Build the fused FFN-up weight blob: 64-byte file header, then chunks for w1 and w3 (hidden_dim*dim floats
+// each, stored as fp16 after a 64-byte chunk header). Returns nil if the buffer cannot be allocated.
+static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) {
+    NSUInteger count = (NSUInteger)hidden_dim * dim, wsize = count * 2;   // elements / fp16 bytes per matrix
+    NSUInteger cs = 64 + wsize, total = 64 + 2 * cs;                      // chunk stride / whole blob size
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    if (!buf) return nil;                                                 // was an unchecked NULL dereference
+    buf[0] = 0x01; buf[4] = 0x02;
+    const float *ws[2] = {w1, w3};
+    for (int w = 0; w < 2; w++) {
+        uint8_t *chunk = buf + 64 + w * cs;
+        chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; chunk[4]=0x01;
+        uint32_t size32 = (uint32_t)wsize, data32 = (uint32_t)(64 + w * cs + 64); // absolute data offset
+        memcpy(chunk + 8, &size32, 4);    // memcpy: chunk is only 2-byte aligned when hidden_dim*dim is odd
+        memcpy(chunk + 16, &data32, 4);
+        _Float16 *fp16 = (_Float16*)(chunk + 64);
+        for (NSUInteger i = 0; i < count; i++) fp16[i] = (_Float16)ws[w][i];
+    }
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Generate MIL for fused FFN up: parallel 1x1 convs W1, W3 (fp16 [hidden_dim, dim, 1, 1]) on x [1, dim, 1, spatial] -> out1, out3
+static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
+    NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;   // bytes per chunk, same stride as mil_build_ffn_up_weight_blob
+    return [NSString stringWithFormat:
+        @"program(%s)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n"
+        "{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
+        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
+        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
+        "    } -> (out1, out3);\n"
+        "}\n",
+        g_ane_platform.mil_program, ane_mil_target(),                 // program(%s), func main<%s>
+        dim, spatial, dim, spatial,                                   // x (fp32), x16 (fp16)
+        hidden_dim, dim, hidden_dim, dim,                             // W1: declared type, value type (offset fixed at 64)
+        hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),   // W3: declared type, value type, blob offset
+        hidden_dim, spatial, hidden_dim, spatial,                     // h1, h3 (fp16 conv outputs)
+        hidden_dim, spatial, hidden_dim, spatial];                    // out1, out3 (fp32 outputs)
+}
diff --git a/training/stories_config.h b/training/stories_config.h
index f967974..d55e115 100644
--- a/training/stories_config.h
+++ b/training/stories_config.h
@@ -1,189 +1,190 @@
-// stories_config.h — Stories110M model config and structures
-#pragma once
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#import <Accelerate/Accelerate.h>
-#include <math.h>
-#include <unistd.h>
-#include <dispatch/dispatch.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-// Stories110M config
-#define DIM 768
-#define HIDDEN 2048
-#define HEADS 12
-#define HD (DIM/HEADS)
-#define SEQ 256
-#define NLAYERS 12
-#define VOCAB 32000
-#define ACCUM_STEPS 10
-#define MAX_COMPILES 100
-
-// Per compile: 5 weight-bearing kernels per layer + 1 classifier = 5*12+1 = 61
-// Plus 1 static (sdpaBwd2 per layer, no weights) = 12 more but those are weight-free
-// Actually sdpaBwd2 has no weights, compile once per layer
-// Weight-bearing: fwdAttn(1) + fwdFFN(1) + ffnBwd(1) + sdpaBwd1(1) + qkvBwd(1) = 5 per layer
-// 5 * 12 = 60 weight-bearing compiles per batch
-// With MAX_COMPILES=100, we get 1 batch of ACCUM_STEPS before restart
-#define KERNELS_PER_LAYER 5
-#define TOTAL_WEIGHT_KERNELS (KERNELS_PER_LAYER * NLAYERS)
-
-// Attention score channels for SDPA backward
-#define SCORE_CH (HEADS*SEQ)
-
-// Weight sizes per layer
-#define WQ_SZ (DIM*DIM)
-#define WO_SZ (DIM*DIM)
-#define W1_SZ (HIDDEN*DIM)
-#define W2_SZ (DIM*HIDDEN)
-#define W3_SZ (HIDDEN*DIM)
-#define LAYER_PARAMS (4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM)
-#define TOTAL_PARAMS (NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM)  // +rms_final+embed
-
-// Per-layer weight and optimizer state
-typedef struct {
-    float *Wq, *Wk, *Wv, *Wo;
-    float *W1, *W2, *W3;
-    float *rms_att, *rms_ffn;
-} LayerWeights;
-
-typedef struct {
-    float *m, *v;
-    size_t n;
-} AdamState;
-
-typedef struct {
-    AdamState Wq, Wk, Wv, Wo;
-    AdamState W1, W2, W3;
-    AdamState rms_att, rms_ffn;
-} LayerAdam;
-
-// Per-layer activation buffers (saved for backward)
-typedef struct {
-    float *layer_in;    // [DIM, SEQ] input to this layer (for rmsnorm1 bwd)
-    float *xnorm;      // [DIM, SEQ] rmsnorm1 output
-    float *Q, *K, *V;  // [DIM, SEQ] QKV projections
-    float *attn_out;    // [DIM, SEQ] attention output (before Wo)
-    float *o_out;       // [DIM, SEQ] Wo output
-    float *x2;          // [DIM, SEQ] residual after attn
-    float *x2norm;      // [DIM, SEQ] rmsnorm2 output
-    float *h1, *h3;     // [HIDDEN, SEQ] FFN intermediates
-    float *silu_out;    // [HIDDEN, SEQ] SiLU(h1)*h3
-    float *ffn_out;     // [DIM, SEQ] FFN output
-} LayerActs;
-
-// Per-layer gradient accumulators
-typedef struct {
-    float *Wq, *Wk, *Wv, *Wo;
-    float *W1, *W2, *W3;
-    float *rms_att, *rms_ffn;
-} LayerGrads;
-
-// ANE kernels per layer
-typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern;
-typedef struct {
-    Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd;
-} LayerKernels;
-
-// Checkpoint header
-typedef struct {
-    int magic;          // 0x424C5A54 "BLZT"
-    int version;        // 2
-    int step, total_steps;
-    int n_layers, vocab_size, dim, hidden_dim, n_heads, seq_len;
-    float lr, loss;
-    double cum_compile, cum_train, cum_wall;
-    int cum_steps, cum_batches;
-    int adam_t;
-    int pad[3];         // alignment
-} CkptHdr;
-
-// llama2.c model file header
-typedef struct {
-    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
-} Llama2Config;
-
-// Globals
-static Class g_D, g_I, g_AR, g_AIO;
-static mach_timebase_info_data_t g_tb;
-static int g_compile_count = 0;
-
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-
-// Alloc helpers
-static AdamState adam_alloc(size_t n) { AdamState s; s.m=(float*)calloc(n,4); s.v=(float*)calloc(n,4); s.n=n; return s; }
-static void adam_free(AdamState *s) { free(s->m); free(s->v); }
-
-static LayerWeights layer_weights_alloc(void) {
-    LayerWeights w;
-    w.Wq=(float*)malloc(WQ_SZ*4); w.Wk=(float*)malloc(WQ_SZ*4);
-    w.Wv=(float*)malloc(WQ_SZ*4); w.Wo=(float*)malloc(WO_SZ*4);
-    w.W1=(float*)malloc(W1_SZ*4); w.W2=(float*)malloc(W2_SZ*4); w.W3=(float*)malloc(W3_SZ*4);
-    w.rms_att=(float*)malloc(DIM*4); w.rms_ffn=(float*)malloc(DIM*4);
-    return w;
-}
-static void layer_weights_free(LayerWeights *w) {
-    free(w->Wq);free(w->Wk);free(w->Wv);free(w->Wo);
-    free(w->W1);free(w->W2);free(w->W3);
-    free(w->rms_att);free(w->rms_ffn);
-}
-static LayerAdam layer_adam_alloc(void) {
-    LayerAdam a;
-    a.Wq=adam_alloc(WQ_SZ); a.Wk=adam_alloc(WQ_SZ); a.Wv=adam_alloc(WQ_SZ); a.Wo=adam_alloc(WO_SZ);
-    a.W1=adam_alloc(W1_SZ); a.W2=adam_alloc(W2_SZ); a.W3=adam_alloc(W3_SZ);
-    a.rms_att=adam_alloc(DIM); a.rms_ffn=adam_alloc(DIM);
-    return a;
-}
-static void layer_adam_free(LayerAdam *a) {
-    adam_free(&a->Wq);adam_free(&a->Wk);adam_free(&a->Wv);adam_free(&a->Wo);
-    adam_free(&a->W1);adam_free(&a->W2);adam_free(&a->W3);
-    adam_free(&a->rms_att);adam_free(&a->rms_ffn);
-}
-static LayerActs layer_acts_alloc(void) {
-    LayerActs a;
-    a.layer_in=(float*)malloc(SEQ*DIM*4);
-    a.xnorm=(float*)malloc(SEQ*DIM*4); a.Q=(float*)malloc(SEQ*DIM*4);
-    a.K=(float*)malloc(SEQ*DIM*4); a.V=(float*)malloc(SEQ*DIM*4);
-    a.attn_out=(float*)malloc(SEQ*DIM*4); a.o_out=(float*)malloc(SEQ*DIM*4);
-    a.x2=(float*)malloc(SEQ*DIM*4); a.x2norm=(float*)malloc(SEQ*DIM*4);
-    a.h1=(float*)malloc(SEQ*HIDDEN*4); a.h3=(float*)malloc(SEQ*HIDDEN*4);
-    a.silu_out=(float*)malloc(SEQ*HIDDEN*4); a.ffn_out=(float*)malloc(SEQ*DIM*4);
-    return a;
-}
-static void layer_acts_free(LayerActs *a) {
-    free(a->layer_in);free(a->xnorm);free(a->Q);free(a->K);free(a->V);
-    free(a->attn_out);free(a->o_out);free(a->x2);free(a->x2norm);
-    free(a->h1);free(a->h3);free(a->silu_out);free(a->ffn_out);
-}
-static LayerGrads layer_grads_alloc(void) {
-    LayerGrads g;
-    g.Wq=(float*)calloc(WQ_SZ,4); g.Wk=(float*)calloc(WQ_SZ,4);
-    g.Wv=(float*)calloc(WQ_SZ,4); g.Wo=(float*)calloc(WO_SZ,4);
-    g.W1=(float*)calloc(W1_SZ,4); g.W2=(float*)calloc(W2_SZ,4); g.W3=(float*)calloc(W3_SZ,4);
-    g.rms_att=(float*)calloc(DIM,4); g.rms_ffn=(float*)calloc(DIM,4);
-    return g;
-}
-static void layer_grads_zero(LayerGrads *g) {
-    memset(g->Wq,0,WQ_SZ*4);memset(g->Wk,0,WQ_SZ*4);
-    memset(g->Wv,0,WQ_SZ*4);memset(g->Wo,0,WO_SZ*4);
-    memset(g->W1,0,W1_SZ*4);memset(g->W2,0,W2_SZ*4);memset(g->W3,0,W3_SZ*4);
-    memset(g->rms_att,0,DIM*4);memset(g->rms_ffn,0,DIM*4);
-}
-static void layer_grads_free(LayerGrads *g) {
-    free(g->Wq);free(g->Wk);free(g->Wv);free(g->Wo);
-    free(g->W1);free(g->W2);free(g->W3);
-    free(g->rms_att);free(g->rms_ffn);
-}
+// stories_config.h — Stories110M model config and structures
+#pragma once
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#import <Accelerate/Accelerate.h>
+#include <math.h>
+#include <unistd.h>
+#include <dispatch/dispatch.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "ane_compat.h"
+
+// Stories110M config (compile-time constants; CkptHdr and Llama2Config carry same-named fields at runtime)
+#define DIM 768            // first dimension of the [DIM, SEQ] activation buffers (see LayerActs)
+#define HIDDEN 2048        // first dimension of the [HIDDEN, SEQ] FFN intermediates
+#define HEADS 12           // divides DIM to give HD; also scales SCORE_CH
+#define HD (DIM/HEADS)     // 768 / 12 = 64
+#define SEQ 256            // second dimension of every activation buffer
+#define NLAYERS 12         // multiplies the per-layer counts in TOTAL_WEIGHT_KERNELS and TOTAL_PARAMS
+#define VOCAB 32000        // TOTAL_PARAMS counts VOCAB*DIM for the embedding
+#define ACCUM_STEPS 10     // presumably gradient-accumulation steps per batch — TODO confirm against the trainer
+#define MAX_COMPILES 100   // compile budget before restart (see the note below)
+
+// Compile budget per batch:
+//   Weight-bearing kernels: fwdAttn(1) + fwdFFN(1) + ffnBwd(1) + sdpaBwd1(1) + qkvBwd(1) = 5 per layer,
+//   i.e. 5 * 12 = 60 weight-bearing compiles per batch (61 if the classifier kernel is counted too).
+//   sdpaBwd2 has no weights, so it is compiled only once per layer (12 more, all weight-free).
+//   With MAX_COMPILES=100, we get 1 batch of ACCUM_STEPS before restart.
+// KERNELS_PER_LAYER below counts only the weight-bearing kernels.
+#define KERNELS_PER_LAYER 5
+#define TOTAL_WEIGHT_KERNELS (KERNELS_PER_LAYER * NLAYERS)
+
+// Attention score channels for SDPA backward
+#define SCORE_CH (HEADS*SEQ)
+
+// Weight sizes per layer, counted in float elements (the alloc helpers below multiply by 4 bytes)
+#define WQ_SZ (DIM*DIM)      // also used to size Wk and Wv
+#define WO_SZ (DIM*DIM)
+#define W1_SZ (HIDDEN*DIM)
+#define W2_SZ (DIM*HIDDEN)
+#define W3_SZ (HIDDEN*DIM)
+#define LAYER_PARAMS (4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM)  // 4*WQ_SZ = Wq,Wk,Wv,Wo (WO_SZ == WQ_SZ); 2*DIM = rms_att + rms_ffn
+#define TOTAL_PARAMS (NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM)  // +rms_final+embed
+
+// Per-layer weights as float buffers; sizes are those used by layer_weights_alloc()
+typedef struct {
+    float *Wq, *Wk, *Wv, *Wo;    // WQ_SZ floats each for Wq/Wk/Wv, WO_SZ for Wo
+    float *W1, *W2, *W3;         // W1_SZ, W2_SZ, W3_SZ floats
+    float *rms_att, *rms_ffn;    // DIM floats each — presumably RMSNorm gains (TODO confirm)
+} LayerWeights;
+
+typedef struct {   // optimizer state for one tensor; created by adam_alloc(), released by adam_free()
+    float *m, *v;  // two n-float buffers, zero-initialised — presumably Adam's first/second moments (TODO confirm)
+    size_t n;      // element count of each buffer
+} AdamState;
+
+typedef struct {   // one AdamState per LayerWeights tensor, sized to match (see layer_adam_alloc)
+    AdamState Wq, Wk, Wv, Wo;
+    AdamState W1, W2, W3;
+    AdamState rms_att, rms_ffn;
+} LayerAdam;
+
+// Per-layer activation buffers (saved for backward); allocated uninitialised by layer_acts_alloc()
+typedef struct {
+    float *layer_in;    // [DIM, SEQ] input to this layer (for rmsnorm1 bwd)
+    float *xnorm;       // [DIM, SEQ] rmsnorm1 output
+    float *Q, *K, *V;   // [DIM, SEQ] QKV projections
+    float *attn_out;    // [DIM, SEQ] attention output (before Wo)
+    float *o_out;       // [DIM, SEQ] Wo output
+    float *x2;          // [DIM, SEQ] residual after attn
+    float *x2norm;      // [DIM, SEQ] rmsnorm2 output
+    float *h1, *h3;     // [HIDDEN, SEQ] FFN intermediates (SEQ*HIDDEN floats each)
+    float *silu_out;    // [HIDDEN, SEQ] SiLU(h1)*h3
+    float *ffn_out;     // [DIM, SEQ] FFN output
+} LayerActs;
+
+// Per-layer gradient accumulators: same fields and sizes as LayerWeights, zero-filled by layer_grads_alloc()
+typedef struct {
+    float *Wq, *Wk, *Wv, *Wo;    // WQ_SZ floats each for Wq/Wk/Wv, WO_SZ for Wo
+    float *W1, *W2, *W3;         // W1_SZ, W2_SZ, W3_SZ floats
+    float *rms_att, *rms_ffn;    // DIM floats each
+} LayerGrads;
+
+// ANE kernels per layer: Kern bundles one kernel's handles, LayerKernels holds the six kernels of a layer
+typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern;   // NOTE(review): void* fields are opaque here — presumably bridged ObjC objects; confirm ownership
+typedef struct {
+    Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd;   // sdpaBwd2 is the weight-free one per the budget note above
+} LayerKernels;
+
+// Checkpoint header (96 bytes with 4-byte int/float and 8-byte double)
+typedef struct {
+    int magic;          // 0x424C5A54 "BLZT"
+    int version;        // 2
+    int step, total_steps;
+    int n_layers, vocab_size, dim, hidden_dim, n_heads, seq_len;   // model shape; presumably checked against the macros above on load — TODO confirm
+    float lr, loss;
+    double cum_compile, cum_train, cum_wall;   // cumulative timings; units are not visible in this header
+    int cum_steps, cum_batches;
+    int adam_t;         // presumably the Adam step counter — TODO confirm
+    int pad[3];         // alignment: rounds the 84 bytes above up to 96
+} CkptHdr;
+
+// llama2.c model file header: seven consecutive ints (28 bytes with 4-byte int)
+typedef struct {
+    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;   // NOTE(review): presumably read straight from the file, so field order must match — confirm
+} Llama2Config;
+
+// Globals (static: every translation unit that includes this header gets its own copy)
+static Class g_D, g_I, g_AR, g_AIO;        // private ANE classes, looked up by name in ane_init()
+static mach_timebase_info_data_t g_tb;     // read by tb_ms(); nothing in this header fills it in — the includer must call mach_timebase_info(&g_tb)
+static int g_compile_count = 0;            // not modified in this header; presumably tracked against MAX_COMPILES — TODO confirm
+
+static void ane_init(void) {   // load the private ANE framework, then cache its classes by name in g_D/g_I/g_AR/g_AIO
+    void *handle = dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    if (!handle) fprintf(stderr, "ane_init: dlopen failed: %s\n", dlerror());   // still best-effort: lookups below may return nil
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest"); g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static double tb_ms(uint64_t t) { const double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }  // mach ticks -> ns -> ms; needs g_tb filled in
+
+// Alloc helpers. adam_alloc: two zero-filled n-float buffers (m, v) plus the count; calloc results are not checked
+static AdamState adam_alloc(size_t n) { AdamState st = { (float*)calloc(n, sizeof(float)), (float*)calloc(n, sizeof(float)), n }; return st; }
+static void adam_free(AdamState *s) { free(s->v); free(s->m); }  // releases both buffers; s->m, s->v and s->n are left as they were
+
+static LayerWeights layer_weights_alloc(void) {   // nine uninitialised float buffers; malloc results are not checked
+    LayerWeights lw;
+    lw.Wq = (float*)malloc(WQ_SZ * sizeof(float));  lw.Wk = (float*)malloc(WQ_SZ * sizeof(float));
+    lw.Wv = (float*)malloc(WQ_SZ * sizeof(float));  lw.Wo = (float*)malloc(WO_SZ * sizeof(float));
+    lw.W1 = (float*)malloc(W1_SZ * sizeof(float));  lw.W2 = (float*)malloc(W2_SZ * sizeof(float));  lw.W3 = (float*)malloc(W3_SZ * sizeof(float));
+    lw.rms_att = (float*)malloc(DIM * sizeof(float));  lw.rms_ffn = (float*)malloc(DIM * sizeof(float));
+    return lw;
+}
+static void layer_weights_free(LayerWeights *w) {   // frees all nine buffers; does not free or reset *w
+    float *bufs[] = { w->Wq, w->Wk, w->Wv, w->Wo, w->W1, w->W2, w->W3, w->rms_att, w->rms_ffn };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++)
+        free(bufs[i]);
+}
+static LayerAdam layer_adam_alloc(void) {   // one zeroed AdamState per weight tensor, with the same element counts as LayerWeights
+    LayerAdam opt;
+    opt.Wq = adam_alloc(WQ_SZ);  opt.Wk = adam_alloc(WQ_SZ);  opt.Wv = adam_alloc(WQ_SZ);  opt.Wo = adam_alloc(WO_SZ);
+    opt.W1 = adam_alloc(W1_SZ);  opt.W2 = adam_alloc(W2_SZ);  opt.W3 = adam_alloc(W3_SZ);
+    opt.rms_att = adam_alloc(DIM);  opt.rms_ffn = adam_alloc(DIM);
+    return opt;
+}
+static void layer_adam_free(LayerAdam *a) {   // calls adam_free on each of the nine states; does not free *a
+    AdamState *states[] = { &a->Wq, &a->Wk, &a->Wv, &a->Wo, &a->W1, &a->W2, &a->W3, &a->rms_att, &a->rms_ffn };
+    for (size_t i = 0; i < sizeof states / sizeof states[0]; i++)
+        adam_free(states[i]);
+}
+static LayerActs layer_acts_alloc(void) {   // 13 uninitialised buffers: SEQ*DIM floats, or SEQ*HIDDEN for h1/h3/silu_out; unchecked malloc
+    const size_t dimBytes = (size_t)SEQ * DIM * sizeof(float), hidBytes = (size_t)SEQ * HIDDEN * sizeof(float);
+    LayerActs acts;
+    acts.layer_in = (float*)malloc(dimBytes);  acts.xnorm = (float*)malloc(dimBytes);  acts.Q = (float*)malloc(dimBytes);
+    acts.K = (float*)malloc(dimBytes);  acts.V = (float*)malloc(dimBytes);
+    acts.attn_out = (float*)malloc(dimBytes);  acts.o_out = (float*)malloc(dimBytes);
+    acts.x2 = (float*)malloc(dimBytes);  acts.x2norm = (float*)malloc(dimBytes);
+    acts.h1 = (float*)malloc(hidBytes);  acts.h3 = (float*)malloc(hidBytes);
+    acts.silu_out = (float*)malloc(hidBytes);  acts.ffn_out = (float*)malloc(dimBytes);
+    return acts;
+}
+static void layer_acts_free(LayerActs *a) {   // frees all 13 buffers; does not free or reset *a
+    float *bufs[] = { a->layer_in, a->xnorm, a->Q, a->K, a->V, a->attn_out, a->o_out,
+                      a->x2, a->x2norm, a->h1, a->h3, a->silu_out, a->ffn_out };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++) free(bufs[i]);
+}
+static LayerGrads layer_grads_alloc(void) {   // nine zero-filled accumulators sized like LayerWeights; calloc results are not checked
+    LayerGrads acc;
+    acc.Wq = (float*)calloc(WQ_SZ, sizeof(float));  acc.Wk = (float*)calloc(WQ_SZ, sizeof(float));
+    acc.Wv = (float*)calloc(WQ_SZ, sizeof(float));  acc.Wo = (float*)calloc(WO_SZ, sizeof(float));
+    acc.W1 = (float*)calloc(W1_SZ, sizeof(float));  acc.W2 = (float*)calloc(W2_SZ, sizeof(float));  acc.W3 = (float*)calloc(W3_SZ, sizeof(float));
+    acc.rms_att = (float*)calloc(DIM, sizeof(float));  acc.rms_ffn = (float*)calloc(DIM, sizeof(float));
+    return acc;
+}
+static void layer_grads_zero(LayerGrads *g) {   // clears every accumulator in place (all-zero bytes), no reallocation
+    struct { float *buf; size_t count; } all[] = {
+        {g->Wq, WQ_SZ}, {g->Wk, WQ_SZ}, {g->Wv, WQ_SZ}, {g->Wo, WO_SZ}, {g->W1, W1_SZ},
+        {g->W2, W2_SZ}, {g->W3, W3_SZ}, {g->rms_att, DIM}, {g->rms_ffn, DIM} };
+    for (size_t i = 0; i < sizeof all / sizeof all[0]; i++) memset(all[i].buf, 0, all[i].count * sizeof(float));
+}
+static void layer_grads_free(LayerGrads *g) {   // frees all nine accumulators; does not free or reset *g
+    float *bufs[] = { g->Wq, g->Wk, g->Wv, g->Wo, g->W1, g->W2, g->W3, g->rms_att, g->rms_ffn };
+    for (size_t i = 0; i < sizeof bufs / sizeof bufs[0]; i++)
+        free(bufs[i]);
+}
diff --git a/training/stories_mil.h b/training/stories_mil.h
index dccca44..1ca063a 100644
--- a/training/stories_mil.h
+++ b/training/stories_mil.h
@@ -1,286 +1,286 @@
-// stories_mil.h — MIL program generators for ANE kernels
-// Same architecture as single-layer train_large.m but parameterized
-#pragma once
-#include "stories_io.h"
-
-#define MIL_HDR \
-    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
-    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
-    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-#define CONV_CONST \
-    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
-    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
-    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-
-// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm)
-static NSString *gen_sdpa_fwd_taps(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    float invd = 1.0f/(float)DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
-    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm)
-static NSString *gen_ffn_fwd_taps(void) {
-    float invd = 1.0f/(float)DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3)
-static NSString *gen_ffn_bwd(void) {
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
-    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// QKV backward: concat(dq,dk,dv) → dx
-static NSString *gen_qkvb(void) {
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// SDPA backward part 1 + Wo^T
-static NSString *gen_sdpa_bwd1(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
-    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// SDPA backward part 2: concat(probs,dp,Q,K) → concat(dQ,dK)
-static NSString *gen_sdpa_bwd2(void) {
-    float sc = 1.0f/sqrtf((float)HD);
-    int bwd2_in = 2*SCORE_CH + 2*DIM;
-    NSMutableString *m = [NSMutableString string];
-    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
-    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
-    [m appendString:@"    } -> (out);\n}\n"];
-    return m;
-}
-
-// Mask blob (causal mask [SEQ,SEQ])
-static NSData *g_mask_blob = nil;
-static NSData *get_mask_blob(void) {
-    if (!g_mask_blob) {
-        _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
-        for(int t=0;t<SEQ;t++) for(int t2=0;t2<SEQ;t2++)
-            mask[t*SEQ+t2] = (t2<=t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
-        g_mask_blob = build_blob_fp16(mask, SEQ*SEQ);
-        free(mask);
-    }
-    return g_mask_blob;
-}
+// stories_mil.h — MIL program generators for ANE kernels
+// Same architecture as single-layer train_large.m but parameterized
+#pragma once
+#include "stories_io.h"
+
+// MIL_HDR used to be a string-literal macro; it now expands to a call to ane_mil_header().
+// ane_mil_header() comes from ane_compat.h and returns the header with the runtime-detected program version.
+// The macro name is kept for backward compatibility, so existing uses of MIL_HDR pick up the detected platform.
+#define MIL_HDR ane_mil_header()
+#define CONV_CONST \
+    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
+    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
+    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+
+// SDPA forward + taps: returns MIL program text for x_in → rmsnorm → Q/K/V convs → masked softmax attention → Wo conv. The single output is the axis-1 concat (oo, qf, kf, vf, af, xn) = (o_out, Q, K, V, attn_out, xnorm), 6*DIM channels.
+static NSString *gen_sdpa_fwd_taps(void) {
+    float sc = 1.0f/sqrtf((float)HD);  // score scale 1/sqrt(HD); emitted below as the fp16 const "scv"
+    float invd = 1.0f/(float)DIM;  // 1/DIM: turns the axis-1 sum of squares into a mean
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];  // RMSNorm starts here: xn = x * (sum_axis1(x^2)/DIM + eps)^-0.5 * rw
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    [m appendString:@CONV_CONST];  // shared conv attributes pt/st/pd/dl/gr referenced by every conv below
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];  // Q/K/V projections of xn; qf/kf/vf are also emitted as taps
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];  // reshape target [1,HEADS,HD,SEQ] for the [1,DIM,1,SEQ] projections
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];  // perm that swaps the last two axes
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
+    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];  // raw scores q·k^T (transpose_y is true)
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];  // additive [1,1,SEQ,SEQ] mask read from mask.bin
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];  // both transpose flags reuse the false const tx: plain aw·v
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];  // attention output back in [1,DIM,1,SEQ] layout, before Wo
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];  // six DIM-channel taps stacked on axis 1
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// FFN forward + taps: returns MIL program text for x2 → rmsnorm → y = W2·(silu(W1·xn) * (W3·xn)). The output is the axis-1 concat (y, h1, h3, gate, xn), 2*DIM+3*HIDDEN channels; note the 4th tap is gate = silu(h1)*h3, not the bare silu output.
+static NSString *gen_ffn_fwd_taps(void) {
+    float invd = 1.0f/(float)DIM;  // 1/DIM: turns the axis-1 sum of squares into a mean
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];  // RMSNorm starts here: xn = x * (sum_axis1(x^2)/DIM + eps)^-0.5 * rw
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    [m appendString:@CONV_CONST];  // shared conv attributes pt/st/pd/dl/gr referenced by every conv below
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];  // DIM -> HIDDEN projections h1 (W1) and h3 (W3)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];  // silu(h1) spelled out as h1 * sigmoid(h1)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];  // HIDDEN -> DIM projection of the gated hidden
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];  // y and xn have DIM channels; h1, h3, gate have HIDDEN
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// FFN backward: returns MIL program text mapping the axis-1 concat(dffn,h1,h3) [DIM+2*HIDDEN ch] → concat(dx,dh1,dh3) [DIM+2*HIDDEN ch]. Weights come from w2t/w1t/w3t.bin (presumably the transposed forward weights — confirm against the blob writer).
+static NSString *gen_ffn_bwd(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), DIM+2*HIDDEN, SEQ];
+    [m appendString:@CONV_CONST];  // shared conv attributes pt/st/pd/dl/gr referenced by every conv below
+    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];  // channels [0, DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];  // channels [DIM, DIM+HIDDEN)
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];  // channels [DIM+HIDDEN, DIM+2*HIDDEN); reuses size s1
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];  // dffn pushed back through W2t: DIM -> HIDDEN channels
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
+    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];  // dsd = sig*(1 + h1*(1 - sig)), the derivative of h1*sigmoid(h1)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];  // dh1 = dsilu * h3 * dsd
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];  // dh3 = dsilu * (h1 * sig)
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];  // dx sums the contributions from the h1 and h3 branches
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// QKV backward: returns MIL program text mapping the axis-1 concat(dq,dk,dv) [3*DIM ch] → dx [DIM ch], where dx = conv(Wqt,dq) + conv(Wkt,dk) + conv(Wvt,dv).
+static NSString *gen_qkvb(void) {
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), 3*DIM, SEQ];
+    [m appendString:@CONV_CONST];  // shared conv attributes pt/st/pd/dl/gr referenced by every conv below
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];  // one slice size shared by all three slices
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];  // channels [0, DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];  // channels [DIM, 2*DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];  // channels [2*DIM, 3*DIM)
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];  // no concat here: the output is the single summed dx
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 1 + Wo^T: returns MIL program text mapping the axis-1 concat(qf,kf,vf,dx2f) [4*DIM ch] → concat(dvf,pf,dpf) [DIM+2*SCORE_CH ch]. It applies the Wot conv to dx2f, recomputes the softmax probs from Q/K and mask.bin, then forms dV = probs^T·da and dP = da·v^T.
+static NSString *gen_sdpa_bwd1(void) {
+    float sc = 1.0f/sqrtf((float)HD);  // score scale 1/sqrt(HD); emitted below as the fp16 const "scv"
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), 4*DIM, SEQ];
+    [m appendString:@CONV_CONST];  // shared conv attributes pt/st/pd/dl/gr referenced by the Wot conv below
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];  // one slice size shared by all four slices
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];  // the "Wo^T" step: dx2f through the Wot conv
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];  // reshape target [1,HEADS,HD,SEQ] for the [1,DIM,1,SEQ] tensors
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];  // perm that swaps the last two axes
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];  // recompute scores q·k^T, then scale, mask and softmax
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];  // dV = probs^T · da
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];  // dP = da · v^T
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];  // flattens [1,HEADS,SEQ,SEQ]; presumably SCORE_CH == HEADS*SEQ — confirm at its definition
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// SDPA backward part 2: returns MIL program text mapping the axis-1 concat(probs,dp,Q,K) [2*SCORE_CH+2*DIM ch] → concat(dQ,dK) [2*DIM ch], with ds = probs*(dp - sum_lastaxis(probs*dp))*sc, dQ = ds·k, dK = ds^T·q. No convs are emitted, so CONV_CONST is not appended.
+static NSString *gen_sdpa_bwd2(void) {
+    float sc = 1.0f/sqrtf((float)HD);  // score scale 1/sqrt(HD); emitted below as the fp16 const "scv"
+    int bwd2_in = 2*SCORE_CH + 2*DIM;  // input channels: probs and dp (SCORE_CH each) + Q and K (DIM each)
+    NSMutableString *m = [NSMutableString string];
+    [m appendString:MIL_HDR];  // header text comes from ane_mil_header() via the MIL_HDR macro
+    [m appendFormat:@"    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), bwd2_in, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];  // channels [0, SCORE_CH)
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];  // channels [SCORE_CH, 2*SCORE_CH)
+    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];  // channels [2*SCORE_CH, 2*SCORE_CH+DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];  // channels [2*SCORE_CH+DIM, 2*SCORE_CH+2*DIM)
+    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];  // restores the flattened scores to [1,HEADS,SEQ,SEQ]
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];  // perm that swaps the last two axes
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
+    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];  // sum of probs*dp over the last axis, kept as size 1
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];  // softmax backward: probs * (dp - sum(probs*dp))
+    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
+    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];  // dQ = ds · k
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];  // dK = ds^T · q
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];  // back to the flat [1,DIM,1,SEQ] layout
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
+    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
+    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
+    [m appendString:@"    } -> (out);\n}\n"];
+    return m;
+}
+
+// Causal mask blob over [SEQ,SEQ]: 0 where t2 <= t, -65504 where t2 > t. Built on first use, then cached.
+static NSData *g_mask_blob = nil;
+static NSData *get_mask_blob(void) {
+    if (g_mask_blob) return g_mask_blob;  // already built: hand back the cached blob
+    // calloc zero-fills (fp16 +0.0), so only the strictly-upper triangle needs a store.
+    _Float16 *causal = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16));
+    for (int row = 0; row < SEQ; row++)
+        for (int col = row + 1; col < SEQ; col++) causal[row*SEQ+col] = (_Float16)(-65504.0f);
+    g_mask_blob = build_blob_fp16(causal, SEQ*SEQ);
+    free(causal);
+    return g_mask_blob;
+}
diff --git a/training/test_ane_advanced.m b/training/test_ane_advanced.m
index 07e9038..a6b2753 100644
--- a/training/test_ane_advanced.m
+++ b/training/test_ane_advanced.m
@@ -1,245 +1,248 @@
-// test_ane_advanced.m — Probe advanced ANE interfaces
-// SharedEvents, weightsBuffer, procedureIndex, VirtualClient, ChainingRequest
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#include <math.h>
-
-static mach_timebase_info_data_t g_tb;
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-
-static void dump_class(const char *name) {
-    Class cls = NSClassFromString([NSString stringWithUTF8String:name]);
-    if (!cls) { printf("  %s: NOT FOUND\n", name); return; }
-    printf("\n=== %s ===\n", name);
-    unsigned int count;
-    Method *methods = class_copyMethodList(object_getClass(cls), &count);
-    if (count) printf("  Class methods:\n");
-    for (unsigned int i = 0; i < count; i++) {
-        SEL s = method_getName(methods[i]);
-        const char *enc = method_getTypeEncoding(methods[i]);
-        printf("    + %s  [%s]\n", sel_getName(s), enc ? enc : "?");
-    }
-    free(methods);
-    methods = class_copyMethodList(cls, &count);
-    if (count) printf("  Instance methods:\n");
-    for (unsigned int i = 0; i < count; i++) {
-        SEL s = method_getName(methods[i]);
-        const char *enc = method_getTypeEncoding(methods[i]);
-        printf("    - %s  [%s]\n", sel_getName(s), enc ? enc : "?");
-    }
-    free(methods);
-    unsigned int pcount;
-    objc_property_t *props = class_copyPropertyList(cls, &pcount);
-    if (pcount) printf("  Properties:\n");
-    for (unsigned int i = 0; i < pcount; i++) {
-        const char *pname = property_getName(props[i]);
-        const char *pattr = property_getAttributes(props[i]);
-        printf("    @property %s  [%s]\n", pname, pattr ? pattr : "?");
-    }
-    free(props);
-}
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        mach_timebase_info(&g_tb);
-        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-
-        printf("=== ANE Advanced Interface Probe ===\n");
-
-        // === Part 1: Event/Sync classes ===
-        printf("\n--- Part 1: Event/Sync Classes ---\n");
-        dump_class("_ANESharedEvents");
-        dump_class("_ANESharedSignalEvent");
-        dump_class("_ANESharedWaitEvent");
-        dump_class("_ANEEvent");
-        dump_class("_ANEFenceEvent");
-
-        const char *event_classes[] = {
-            "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent",
-            "_ANEEvent", "_ANEFenceEvent", NULL
-        };
-        for (int i = 0; event_classes[i]; i++) {
-            Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]);
-            if (!cls) continue;
-            @try {
-                id obj = [[cls alloc] init];
-                printf("  %s alloc/init: %s\n", event_classes[i],
-                       obj ? [[obj description] UTF8String] : "nil");
-            } @catch (NSException *ex) {
-                printf("  %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]);
-            }
-        }
-
-        // === Part 2: VirtualClient and ChainingRequest ===
-        printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n");
-        dump_class("_ANEVirtualClient");
-        dump_class("_ANEChainingRequest");
-        dump_class("_ANEMultiRequest");
-        dump_class("_ANEBatchRequest");
-
-        // === Part 3: Compile working kernel for weightsBuffer + procedureIndex tests ===
-        printf("\n--- Part 3: weightsBuffer IOSurface test ---\n");
-        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
-        Class g_AR = NSClassFromString(@"_ANERequest");
-        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-
-        int CH = 64, SP = 32;
-        _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16));
-        for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f;
-        int ws = CH*CH*2, tot = 128+ws;
-        uint8_t *blob = (uint8_t*)calloc(tot,1);
-        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
-        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
-        memcpy(blob+128, w, ws);
-        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
-
-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
-
-        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
-            md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
-        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        NSFileManager *fm = [NSFileManager defaultManager];
-        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
-            withIntermediateDirectories:YES attributes:nil error:nil];
-        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-
-        NSError *e = nil;
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-
-        int ioBytes = CH * SP * 4;
-        IOSurfaceRef ioIn = make_surface(ioBytes);
-        IOSurfaceRef ioOut = make_surface(ioBytes);
-
-        IOSurfaceLock(ioIn, 0, NULL);
-        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
-        IOSurfaceUnlock(ioIn, 0, NULL);
-
-        // Baseline eval
-        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-        id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req0, &e);
-        printf("  Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");
-
-        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
-        float baseline_0 = out0[0], baseline_1 = out0[1];
-        printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
-        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
-
-        // Test weightsBuffer: IOSurface with 3x identity weights
-        printf("\n  Testing weightsBuffer IOSurface...\n");
-        _Float16 *w3 = (_Float16*)calloc(CH*CH, sizeof(_Float16));
-        for (int i = 0; i < CH; i++) w3[i*CH+i] = (_Float16)3.0f;
-        IOSurfaceRef ioW = make_surface(ws);
-        IOSurfaceLock(ioW, 0, NULL);
-        memcpy(IOSurfaceGetBaseAddress(ioW), w3, ws);
-        IOSurfaceUnlock(ioW, 0, NULL);
-        free(w3);
-        id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW);
-
-        wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-        wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-        id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            @[wI], @[@0], @[wO], @[@0], wW, nil, @0);
-        printf("  Request with weightsBuffer: %s\n", req_wb ? "created" : "nil");
-
-        if (req_wb) {
-            ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e);
-            printf("  Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL");
-            if (ok) {
-                IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-                float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
-                printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
-                bool changed = fabsf(outW[0] - baseline_0) > 0.001f;
-                bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f;
-                printf("  weightsBuffer: output %s", changed ? "CHANGED" : "unchanged");
-                if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected");
-                printf("\n");
-                IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
-            }
-        }
-        CFRelease(ioW);
-
-        // === Part 4: procedureIndex sweep ===
-        printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n");
-        for (int pi = 0; pi < 16; pi++) {
-            wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-            wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-            id req_p = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-                @[wI], @[@0], @[wO], @[@0], nil, nil, @(pi));
-            if (!req_p) { printf("  procIdx %2d: request=nil\n", pi); continue; }
-            ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_p, &e);
-            printf("  procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL",
-                   !ok && e ? [NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : "");
-        }
-
-        // === Part 5: Scan all ANE classes ===
-        printf("\n--- Part 5: All ANE-prefixed classes ---\n");
-        unsigned int classCount;
-        Class *allClasses = objc_copyClassList(&classCount);
-        for (unsigned int i = 0; i < classCount; i++) {
-            const char *name = class_getName(allClasses[i]);
-            if (strstr(name, "ANE") || strstr(name, "ane")) {
-                printf("  %s\n", name);
-            }
-        }
-        free(allClasses);
-        free(w);
-
-        // Cleanup
-        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        [fm removeItemAtPath:td error:nil];
-        CFRelease(ioIn); CFRelease(ioOut);
-
-        printf("\nDone.\n");
-    }
-    return 0;
-}
+// test_ane_advanced.m — Probe advanced ANE interfaces
+// SharedEvents, weightsBuffer, procedureIndex, VirtualClient, ChainingRequest
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include "ane_compat.h"
+
+static mach_timebase_info_data_t g_tb;  // timebase ratio; main() fills it with mach_timebase_info()
+static double tb_ms(uint64_t t) { double scaled = (double)t * g_tb.numer / g_tb.denom; return scaled / 1e6; }
+
+// Print the runtime metadata of the class called `name`: class methods, instance
+// methods and declared properties, each with its encoding string ("?" when NULL).
+static void dump_class(const char *name) {
+    Class target = NSClassFromString([NSString stringWithUTF8String:name]);
+    if (target == Nil) { printf("  %s: NOT FOUND\n", name); return; }
+    printf("\n=== %s ===\n", name);
+    // Class methods are read from object_getClass(target), instance methods from target itself.
+    unsigned int total;
+    Method *list = class_copyMethodList(object_getClass(target), &total);
+    if (total) printf("  Class methods:\n");
+    for (unsigned int idx = 0; idx < total; idx++) {
+        const char *encoding = method_getTypeEncoding(list[idx]);
+        printf("    + %s  [%s]\n", sel_getName(method_getName(list[idx])), encoding ? encoding : "?");
+    }
+    free(list);
+    list = class_copyMethodList(target, &total);
+    if (total) printf("  Instance methods:\n");
+    for (unsigned int idx = 0; idx < total; idx++) {
+        const char *encoding = method_getTypeEncoding(list[idx]);
+        printf("    - %s  [%s]\n", sel_getName(method_getName(list[idx])), encoding ? encoding : "?");
+    }
+    free(list);
+    unsigned int propTotal;
+    objc_property_t *propList = class_copyPropertyList(target, &propTotal);
+    if (propTotal) printf("  Properties:\n");
+    for (unsigned int idx = 0; idx < propTotal; idx++) {
+        const char *attrs = property_getAttributes(propList[idx]);
+        printf("    @property %s  [%s]\n", property_getName(propList[idx]), attrs ? attrs : "?");
+    }
+    free(propList);
+}
+
+// Create a bytes-wide, 1-row IOSurface (1 byte per element, pixel format 0); callers CFRelease the result.
+static IOSurfaceRef make_surface(size_t bytes) {
+    NSDictionary *props = @{(id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, (id)kIOSurfaceBytesPerElement:@1,
+        (id)kIOSurfaceBytesPerRow:@(bytes), (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+// Probe driver: dumps the ANE event and request classes, compiles a small identity-conv kernel to
+// exercise the weightsBuffer and procedureIndex request arguments, then lists every ANE-named class.
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        mach_timebase_info(&g_tb);
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+        ane_detect_platform();
+        ane_print_platform();
+        printf("=== ANE Advanced Interface Probe ===\n");
+
+        // === Part 1: Event/Sync classes ===
+        printf("\n--- Part 1: Event/Sync Classes ---\n");
+        dump_class("_ANESharedEvents");
+        dump_class("_ANESharedSignalEvent");
+        dump_class("_ANESharedWaitEvent");
+        dump_class("_ANEEvent");
+        dump_class("_ANEFenceEvent");
+
+        const char *event_classes[] = {
+            "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent",
+            "_ANEEvent", "_ANEFenceEvent", NULL
+        };
+        for (int i = 0; event_classes[i]; i++) {
+            Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]);
+            if (!cls) continue;
+            @try {
+                id obj = [[cls alloc] init];
+                printf("  %s alloc/init: %s\n", event_classes[i],
+                       obj ? [[obj description] UTF8String] : "nil");
+            } @catch (NSException *ex) {
+                printf("  %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]);
+            }
+        }
+
+        // === Part 2: VirtualClient and ChainingRequest ===
+        printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n");
+        dump_class("_ANEVirtualClient");
+        dump_class("_ANEChainingRequest");
+        dump_class("_ANEMultiRequest");
+        dump_class("_ANEBatchRequest");
+
+        // === Part 3: Compile working kernel for weightsBuffer + procedureIndex tests ===
+        printf("\n--- Part 3: weightsBuffer IOSurface test ---\n");
+        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
+        Class g_AR = NSClassFromString(@"_ANERequest");
+        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+
+        int CH = 64, SP = 32;
+        _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16));
+        for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f;  // identity matrix
+        int ws = CH*CH*2, tot = 128+ws;
+        uint8_t *blob = (uint8_t*)calloc(tot,1);  // 128 header bytes followed by the fp16 weights
+        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
+        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;  // ws at byte 72; 128 (where the weights start) at byte 80
+        memcpy(blob+128, w, ws);
+        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];  // wdata now owns blob
+
+        NSString *mil = [NSString stringWithFormat:
+            @"program(%s)\n"
+            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n"
+            "{\n"
+            "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
+            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
+            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
+            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+            "[name=string(\"conv\")];\n"
+            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
+            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
+            "    } -> (y);\n"
+            "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+
+        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
+            md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
+        id mdl = desc ? ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc) : nil;
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));  // nil if any earlier step yielded nil
+        if (!hx) { printf("  Model setup failed (descriptor %s, model %s); cannot run Parts 3-5.\n", desc ? "ok" : "nil", mdl ? "ok" : "nil"); free(w); return 1; }
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+            withIntermediateDirectories:YES attributes:nil error:nil];
+        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+        NSError *e = nil;
+        BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        if (!compiled || !loaded) printf("  WARNING: compile %s, load %s: %s\n", compiled ? "OK" : "FAILED", loaded ? "OK" : "FAILED", e ? [[e description] UTF8String] : "(no error object)");
+
+        int ioBytes = CH * SP * 4;
+        IOSurfaceRef ioIn = make_surface(ioBytes);
+        IOSurfaceRef ioOut = make_surface(ioBytes);
+        IOSurfaceLock(ioIn, 0, NULL);
+        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+        for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
+        IOSurfaceUnlock(ioIn, 0, NULL);
+
+        // Baseline eval
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+        id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req0, &e);
+        printf("  Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");
+        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
+        float baseline_0 = out0[0];  // reference value for the weightsBuffer comparison below
+        printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
+        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+
+        // Test weightsBuffer: IOSurface with 3x identity weights
+        printf("\n  Testing weightsBuffer IOSurface...\n");
+        _Float16 *w3 = (_Float16*)calloc(CH*CH, sizeof(_Float16));
+        for (int i = 0; i < CH; i++) w3[i*CH+i] = (_Float16)3.0f;
+        IOSurfaceRef ioW = make_surface(ws);
+        IOSurfaceLock(ioW, 0, NULL);
+        memcpy(IOSurfaceGetBaseAddress(ioW), w3, ws);
+        IOSurfaceUnlock(ioW, 0, NULL);
+        free(w3);
+        id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW);
+        wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+        wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+        id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], wW, nil, @0);
+        printf("  Request with weightsBuffer: %s\n", req_wb ? "created" : "nil");
+        if (req_wb) {
+            e = nil;  // drop any earlier error so a failure below reports only its own
+            ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e);
+            printf("  Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL");
+            if (ok) {
+                IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+                float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
+                printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
+                bool changed = fabsf(outW[0] - baseline_0) > 0.001f;
+                bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f;
+                printf("  weightsBuffer: output %s", changed ? "CHANGED" : "unchanged");
+                if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected");
+                printf("\n");
+                IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+            }
+        }
+        CFRelease(ioW);
+
+        // === Part 4: procedureIndex sweep ===
+        printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n");
+        for (int pi = 0; pi < 16; pi++) {
+            wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+            wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+            id req_p = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, nil, @(pi));
+            if (!req_p) { printf("  procIdx %2d: request=nil\n", pi); continue; }
+            e = nil;  // each index reports only the error produced by its own evaluation
+            ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_p, &e);
+            printf("  procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL",
+                   !ok && e ? [NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : "");
+        }
+
+        // === Part 5: Scan all ANE classes ===
+        printf("\n--- Part 5: All ANE-prefixed classes ---\n");
+        unsigned int classCount;
+        Class *allClasses = objc_copyClassList(&classCount);
+        for (unsigned int i = 0; i < classCount; i++) {
+            const char *name = class_getName(allClasses[i]);
+            if (strstr(name, "ANE") || strstr(name, "ane")) {  // substring match anywhere in the name
+                printf("  %s\n", name);
+            }
+        }
+        free(allClasses);
+        free(w);
+
+        // Cleanup
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [fm removeItemAtPath:td error:nil];
+        CFRelease(ioIn); CFRelease(ioOut);
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m
index cb9b761..55c381b 100644
--- a/training/test_ane_causal_attn.m
+++ b/training/test_ane_causal_attn.m
@@ -1,295 +1,301 @@
-// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE
-// This gives us causal masking with ANE acceleration for the matmuls
-#import <Foundation/Foundation.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#include <math.h>
-
-#define HEADS 12
-#define HD 64
-#define SEQ 64
-
-static Class g_D, g_I, g_AR, g_AIO;
-static mach_timebase_info_data_t g_tb;
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-typedef struct { id model; NSString *td; } Kern;
-
-static Kern compile_mil(NSString *mil) {
-    Kern k = {nil, nil};
-    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil);
-    if (!desc) { printf("desc=NULL\n"); return k; }
-    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-    [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil];
-    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
-        printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:"");
-        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k;
-    }
-    ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-    k.model = mdl; k.td = td;
-    return k;
-}
-
-static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) {
-    NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array];
-    for (int i = 0; i < nin; i++) {
-        [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])];
-        [inIdx addObject:@(i)];
-    }
-    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out);
-    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-        inArr, inIdx, @[wO], @[@0], nil, nil, @0);
-    NSError *e = nil;
-    return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-}
-
-static void cleanup_kern(Kern *k) {
-    if (!k->model) return;
-    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
-    [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        ane_init();
-        mach_timebase_info(&g_tb);
-
-        // === Approach 1: Non-causal SDPA (baseline) ===
-        printf("=== Non-causal SDPA (baseline) ===\n");
-        NSString *sdpa_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-            "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-            "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-            "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
-            "    } -> (att);\n}\n",
-            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
-        Kern kSDPA = compile_mil(sdpa_mil);
-        printf("SDPA compile: %s\n", kSDPA.model ? "OK" : "FAIL");
-
-        // === Approach 2: Decomposed causal via matmul ops ===
-        // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ]
-        // MIL matmul: matmul(x=Q, y=K, transpose_y=true)
-        // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD]
-        // scores = Q @ K^T → [1, HEADS, SEQ, SEQ]
-        printf("\n=== Decomposed causal attention ===\n");
-        NSString *qkt_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-            "tensor<fp16, [1, %d, %d, %d]> k) {\n"
-            "        tensor<fp16, [1, %d, %d, %d]> scores = matmul("
-            "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
-            "    } -> (scores);\n}\n",
-            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
-        Kern kQKT = compile_mil(qkt_mil);
-        printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL");
-
-        // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD]
-        NSString *sv_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> s, "
-            "tensor<fp16, [1, %d, %d, %d]> v) {\n"
-            "        tensor<fp16, [1, %d, %d, %d]> out = matmul("
-            "x = s, y = v)[name = string(\"sv\")];\n"
-            "    } -> (out);\n}\n",
-            HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
-        Kern kSV = compile_mil(sv_mil);
-        printf("scores@V compile: %s\n", kSV.model ? "OK" : "FAIL");
-
-        if (!kSDPA.model || !kQKT.model || !kSV.model) {
-            printf("Some kernels failed to compile, aborting\n");
-            goto done;
-        }
-
-        // Generate test data
-        srand48(42);
-        int total_qkv = HEADS * SEQ * HD;
-        _Float16 *Q = (_Float16*)malloc(total_qkv * 2);
-        _Float16 *K = (_Float16*)malloc(total_qkv * 2);
-        _Float16 *V = (_Float16*)malloc(total_qkv * 2);
-        for (int i = 0; i < total_qkv; i++) {
-            Q[i] = (_Float16)(0.5f * (2*drand48()-1));
-            K[i] = (_Float16)(0.5f * (2*drand48()-1));
-            V[i] = (_Float16)(0.5f * (2*drand48()-1));
-        }
-
-        // IOSurfaces for Q, K, V
-        size_t qkv_bytes = total_qkv * 2;
-        IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes);
-        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL);
-        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL);
-        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL);
-
-        // Scores IOSurface: [1, HEADS, SEQ, SEQ]
-        int total_scores = HEADS * SEQ * SEQ;
-        size_t scores_bytes = total_scores * 2;
-        IOSurfaceRef ioScores = make_surface(scores_bytes);
-        IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes);
-        IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes);
-
-        // === Run non-causal SDPA ===
-        {
-            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
-            if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; }
-        }
-
-        // === Run decomposed causal ===
-        // Step 1: Q@K^T on ANE
-        {
-            IOSurfaceRef ins[] = {ioQ, ioK};
-            if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; }
-        }
-
-        // Step 2: Scale + causal mask + softmax on CPU
-        {
-            IOSurfaceLock(ioScores, 0, NULL);
-            _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
-            float scale = 1.0f / sqrtf((float)HD);
-            for (int h = 0; h < HEADS; h++) {
-                for (int t = 0; t < SEQ; t++) {
-                    // Apply scale, causal mask, and softmax
-                    float row[SEQ], maxs = -1e30f;
-                    for (int t2 = 0; t2 < SEQ; t2++) {
-                        float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale;
-                        if (t2 > t) s = -1e30f;  // causal mask
-                        row[t2] = s;
-                        if (s > maxs) maxs = s;
-                    }
-                    float sum = 0;
-                    for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; }
-                    for (int t2 = 0; t2 < SEQ; t2++)
-                        scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum);
-                }
-            }
-            IOSurfaceUnlock(ioScores, 0, NULL);
-        }
-
-        // Step 3: softmax_scores @ V on ANE
-        {
-            IOSurfaceRef ins[] = {ioScores, ioV};
-            if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; }
-        }
-
-        // === Verify decomposed causal ===
-        {
-            float scale = 1.0f / sqrtf((float)HD);
-            IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
-            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp);
-            float maxdiff = 0;
-            for (int h = 0; h < HEADS; h++)
-                for (int t = 0; t < SEQ; t++) {
-                    float scores[SEQ], maxs = -1e30f;
-                    for (int t2 = 0; t2 <= t; t2++) {
-                        float s = 0;
-                        for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
-                        s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
-                    }
-                    float sum = 0;
-                    for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
-                    for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
-                    for (int d = 0; d < HD; d++) {
-                        float ref = 0;
-                        for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
-                        float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref);
-                        if(diff>maxdiff) maxdiff=diff;
-                    }
-                }
-            IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
-            printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff);
-        }
-
-        // === Benchmark: SDPA vs decomposed ===
-        printf("\n=== Benchmarks ===\n");
-        int N = 500;
-        {
-            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
-            // Warmup
-            for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
-            uint64_t t0 = mach_absolute_time();
-            for (int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
-            double ms = tb_ms(mach_absolute_time() - t0);
-            double flops = 4.0 * HEADS * SEQ * SEQ * HD;
-            printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);
-        }
-        {
-            // Decomposed: QKT + CPU softmax + SV
-            // Warmup
-            for (int i = 0; i < 10; i++) {
-                IOSurfaceRef ins1[] = {ioQ, ioK};
-                ane_eval(&kQKT, ins1, 2, ioScores);
-                // Skip CPU softmax in benchmark for ANE-only timing
-                IOSurfaceRef ins2[] = {ioScores, ioV};
-                ane_eval(&kSV, ins2, 2, ioOut_decomp);
-            }
-            uint64_t t0 = mach_absolute_time();
-            for (int i = 0; i < N; i++) {
-                IOSurfaceRef ins1[] = {ioQ, ioK};
-                ane_eval(&kQKT, ins1, 2, ioScores);
-                // CPU softmax + causal mask
-                IOSurfaceLock(ioScores, 0, NULL);
-                _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
-                float scale = 1.0f / sqrtf((float)HD);
-                for (int h = 0; h < HEADS; h++)
-                    for (int t = 0; t < SEQ; t++) {
-                        float row[SEQ], maxs = -1e30f;
-                        for (int t2 = 0; t2 < SEQ; t2++) {
-                            float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale;
-                            if (t2 > t) s = -1e30f;
-                            row[t2] = s; if(s>maxs) maxs=s;
-                        }
-                        float sum = 0;
-                        for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; }
-                        for (int t2 = 0; t2 < SEQ; t2++)
-                            sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum);
-                    }
-                IOSurfaceUnlock(ioScores, 0, NULL);
-                IOSurfaceRef ins2[] = {ioScores, ioV};
-                ane_eval(&kSV, ins2, 2, ioOut_decomp);
-            }
-            double ms = tb_ms(mach_absolute_time() - t0);
-            double flops = 4.0 * HEADS * SEQ * SEQ * HD;
-            printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);
-        }
-
-        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
-        CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp);
-        free(Q); free(K); free(V);
-
-        done:
-        cleanup_kern(&kSDPA);
-        cleanup_kern(&kQKT);
-        cleanup_kern(&kSV);
-        printf("\nDONE\n");
-    }
-    return 0;
-}
+// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE
+// This gives us causal masking with ANE acceleration for the matmuls
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include "ane_compat.h"
+
+#define HEADS 12
+#define HD 64
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static mach_timebase_info_data_t g_tb;
+// Load the private AppleNeuralEngine framework and resolve the four runtime classes used below; warn on stderr if any is missing.
+static void ane_init(void) {
+    void *fw = dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I   = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");                 g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+    if (!fw || !g_D || !g_I || !g_AR || !g_AIO) fprintf(stderr, "ane_init: AppleNeuralEngine framework or private classes unavailable; ANE calls below will fail\n");
+}
+static double tb_ms(uint64_t t) { double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }  // mach ticks -> ns (via g_tb) -> ms
+static IOSurfaceRef make_surface(size_t bytes) {  // single-row surface, `bytes` wide at 1 byte/element; callers in this file CFRelease it
+    NSDictionary *props = @{(id)kIOSurfaceWidth: @(bytes), (id)kIOSurfaceHeight: @1,
+                            (id)kIOSurfaceBytesPerElement: @1, (id)kIOSurfaceBytesPerRow: @(bytes),
+                            (id)kIOSurfaceAllocSize: @(bytes), (id)kIOSurfacePixelFormat: @0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+typedef struct { id model; NSString *td; } Kern;  // model: object returned by compile_mil (nil on failure); td: temp directory holding model.mil
+
+// Compile `mil` into an in-memory ANE model and load it. Returns {model, tempDir} on success, or
+// {nil, nil} after printing the failing stage to stdout (descriptor, model, compile or load).
+static Kern compile_mil(NSString *mil) {
+    Kern k = {nil, nil};
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil);
+    if (!desc) { printf("desc=NULL\n"); return k; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    if (!mdl || !hx) { printf("model=NULL\n"); return k; }  // a nil path component below would raise an exception
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    NSError *e = nil; const char *stage = "compile";
+    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+    if (ok) { stage = "load"; ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); }  // a model that fails to load cannot be evaluated
+    if (!ok) { printf("%s FAIL: %s\n", stage, e?[[e localizedDescription] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; }
+    k.model = mdl; k.td = td; return k;
+}
+
+// Evaluate k->model once: ins[i] is bound to input index i, `out` to output index 0. Returns the BOOL from evaluateWithQoS:...; the NSError is not surfaced.
+static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) {
+    id (*wrapSurface)(Class,SEL,IOSurfaceRef) = (id(*)(Class,SEL,IOSurfaceRef))objc_msgSend;
+    NSMutableArray *inputs = [NSMutableArray array], *indices = [NSMutableArray array];
+    for (int slot = 0; slot < nin; slot++) {
+        [inputs addObject:wrapSurface(g_AIO, @selector(objectWithIOSurface:), ins[slot])];
+        [indices addObject:@(slot)];
+    }
+    id outObj = wrapSurface(g_AIO, @selector(objectWithIOSurface:), out);
+    id request = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        inputs, indices, @[outObj], @[@0], nil, nil, @0);
+    NSError *err = nil; return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, request, &err);
+}
+
+// Unloads k->model and removes its temp directory; does nothing when k->model is nil.
+static void cleanup_kern(Kern *k) {
+    NSError *unloadErr = nil;  // may be filled in by unload; not inspected
+    if (k->model) ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &unloadErr);
+    if (k->model) [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil];
+}
+
+int main() {  // Compiles SDPA, Q@K^T and scores@V kernels, checks the decomposed causal path against a CPU reference, then benchmarks both paths.
+    @autoreleasepool {
+        setbuf(stdout, NULL);  // unbuffered stdout: each printf appears immediately
+        ane_init();
+        ane_detect_platform();  // from ane_compat.h; presumably fills g_ane_platform used in the MIL headers below — confirm
+        ane_print_platform();
+        mach_timebase_info(&g_tb);  // must run before any tb_ms() call
+
+        // === Approach 1: Non-causal SDPA (baseline) ===
+        printf("=== Non-causal SDPA (baseline) ===\n");
+        NSString *sdpa_mil = [NSString stringWithFormat:
+            @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n{\n"
+            "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+            "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+            "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+            "    } -> (att);\n}\n",
+            g_ane_platform.mil_program, ane_mil_target(),  // fill program(%s) and main<%s>
+            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
+        Kern kSDPA = compile_mil(sdpa_mil);
+        printf("SDPA compile: %s\n", kSDPA.model ? "OK" : "FAIL");
+
+        // === Approach 2: Decomposed causal via matmul ops ===
+        // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ]
+        // MIL matmul: matmul(x=Q, y=K, transpose_y=true)
+        // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD]
+        // scores = Q @ K^T → [1, HEADS, SEQ, SEQ]
+        printf("\n=== Decomposed causal attention ===\n");
+        NSString *qkt_mil = [NSString stringWithFormat:
+            @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n{\n"
+            "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+            "tensor<fp16, [1, %d, %d, %d]> k) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> scores = matmul("
+            "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
+            "    } -> (scores);\n}\n",
+            g_ane_platform.mil_program, ane_mil_target(),  // fill program(%s) and main<%s>
+            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
+        Kern kQKT = compile_mil(qkt_mil);
+        printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL");
+
+        // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD]
+        NSString *sv_mil = [NSString stringWithFormat:
+            @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n{\n"
+            "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> s, "
+            "tensor<fp16, [1, %d, %d, %d]> v) {\n"
+            "        tensor<fp16, [1, %d, %d, %d]> out = matmul("
+            "x = s, y = v)[name = string(\"sv\")];\n"
+            "    } -> (out);\n}\n",
+            g_ane_platform.mil_program, ane_mil_target(),  // fill program(%s) and main<%s>
+            HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
+        Kern kSV = compile_mil(sv_mil);
+        printf("scores@V compile: %s\n", kSV.model ? "OK" : "FAIL");
+
+        if (!kSDPA.model || !kQKT.model || !kSV.model) {
+            printf("Some kernels failed to compile, aborting\n");
+            goto done;  // nothing but the kernels has been allocated yet
+        }
+
+        // Generate test data
+        srand48(42);  // fixed seed: identical Q/K/V on every run
+        int total_qkv = HEADS * SEQ * HD;
+        _Float16 *Q = (_Float16*)malloc(total_qkv * 2);  // 2 bytes per fp16 element
+        _Float16 *K = (_Float16*)malloc(total_qkv * 2);
+        _Float16 *V = (_Float16*)malloc(total_qkv * 2);
+        for (int i = 0; i < total_qkv; i++) {
+            Q[i] = (_Float16)(0.5f * (2*drand48()-1));  // uniform in [-0.5, 0.5)
+            K[i] = (_Float16)(0.5f * (2*drand48()-1));
+            V[i] = (_Float16)(0.5f * (2*drand48()-1));
+        }
+
+        // IOSurfaces for Q, K, V
+        size_t qkv_bytes = total_qkv * 2;
+        IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes);
+        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL);
+        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL);
+        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL);
+
+        // Scores IOSurface: [1, HEADS, SEQ, SEQ]
+        int total_scores = HEADS * SEQ * SEQ;
+        size_t scores_bytes = total_scores * 2;
+        IOSurfaceRef ioScores = make_surface(scores_bytes);
+        IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes);  // NOTE(review): written by the SDPA kernel but never read back in this test
+        IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes);
+
+        // === Run non-causal SDPA ===
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+            if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; }  // NOTE(review): this path skips the CFRelease/free calls before `done`
+        }
+
+        // === Run decomposed causal ===
+        // Step 1: Q@K^T on ANE
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK};
+            if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; }
+        }
+
+        // Step 2: Scale + causal mask + softmax on CPU
+        {
+            IOSurfaceLock(ioScores, 0, NULL);
+            _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores);  // rewritten in place, row by row
+            float scale = 1.0f / sqrtf((float)HD);
+            for (int h = 0; h < HEADS; h++) {
+                for (int t = 0; t < SEQ; t++) {
+                    // Apply scale, causal mask, and softmax
+                    float row[SEQ], maxs = -1e30f;
+                    for (int t2 = 0; t2 < SEQ; t2++) {
+                        float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale;
+                        if (t2 > t) s = -1e30f;  // causal mask: future positions get a huge negative so expf() yields 0
+                        row[t2] = s;
+                        if (s > maxs) maxs = s;
+                    }
+                    float sum = 0;
+                    for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; }  // subtract row max for numerical stability
+                    for (int t2 = 0; t2 < SEQ; t2++)
+                        scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum);
+                }
+            }
+            IOSurfaceUnlock(ioScores, 0, NULL);
+        }
+
+        // Step 3: softmax_scores @ V on ANE
+        {
+            IOSurfaceRef ins[] = {ioScores, ioV};
+            if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; }
+        }
+
+        // === Verify decomposed causal ===
+        {
+            float scale = 1.0f / sqrtf((float)HD);
+            IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
+            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp);
+            float maxdiff = 0;
+            for (int h = 0; h < HEADS; h++)
+                for (int t = 0; t < SEQ; t++) {
+                    float scores[SEQ], maxs = -1e30f;  // only scores[0..t] are filled and used for row t
+                    for (int t2 = 0; t2 <= t; t2++) {
+                        float s = 0;
+                        for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                        s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                    }
+                    float sum = 0;
+                    for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                    for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
+                    for (int d = 0; d < HD; d++) {
+                        float ref = 0;
+                        for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                        float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref);
+                        if(diff>maxdiff) maxdiff=diff;
+                    }
+                }
+            IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL);
+            printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff);  // reported only; no pass/fail threshold is applied
+        }
+
+        // === Benchmark: SDPA vs decomposed ===
+        printf("\n=== Benchmarks ===\n");
+        int N = 500;  // timed iterations per path
+        {
+            IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+            // Warmup
+            for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa);
+            double ms = tb_ms(mach_absolute_time() - t0);
+            double flops = 4.0 * HEADS * SEQ * SEQ * HD;  // two matmuls (Q@K^T, scores@V) x 2 FLOPs per multiply-add
+            printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);  // FLOPs per ms / 1e6 == GFLOP/s
+        }
+        {
+            // Decomposed: QKT + CPU softmax + SV
+            // Warmup
+            for (int i = 0; i < 10; i++) {
+                IOSurfaceRef ins1[] = {ioQ, ioK};
+                ane_eval(&kQKT, ins1, 2, ioScores);
+                // Warmup runs only the two ANE kernels; the CPU mask+softmax is skipped here but IS included in the timed loop below
+                IOSurfaceRef ins2[] = {ioScores, ioV};
+                ane_eval(&kSV, ins2, 2, ioOut_decomp);
+            }
+            uint64_t t0 = mach_absolute_time();
+            for (int i = 0; i < N; i++) {
+                IOSurfaceRef ins1[] = {ioQ, ioK};
+                ane_eval(&kQKT, ins1, 2, ioScores);
+                // CPU softmax + causal mask
+                IOSurfaceLock(ioScores, 0, NULL);
+                _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores);
+                float scale = 1.0f / sqrtf((float)HD);
+                for (int h = 0; h < HEADS; h++)
+                    for (int t = 0; t < SEQ; t++) {
+                        float row[SEQ], maxs = -1e30f;
+                        for (int t2 = 0; t2 < SEQ; t2++) {
+                            float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale;
+                            if (t2 > t) s = -1e30f;
+                            row[t2] = s; if(s>maxs) maxs=s;
+                        }
+                        float sum = 0;
+                        for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; }
+                        for (int t2 = 0; t2 < SEQ; t2++)
+                            sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum);
+                    }
+                IOSurfaceUnlock(ioScores, 0, NULL);
+                IOSurfaceRef ins2[] = {ioScores, ioV};
+                ane_eval(&kSV, ins2, 2, ioOut_decomp);
+            }
+            double ms = tb_ms(mach_absolute_time() - t0);
+            double flops = 4.0 * HEADS * SEQ * SEQ * HD;  // same nominal FLOP count as the non-causal path, so the two GFLOPS figures are comparable
+            printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6);
+        }
+
+        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
+        CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp);
+        free(Q); free(K); free(V);
+
+        done:  // shared exit: reached normally and from every failure path above
+        cleanup_kern(&kSDPA);
+        cleanup_kern(&kQKT);
+        cleanup_kern(&kSV);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m
index 0ddce84..70a6d40 100644
--- a/training/test_ane_sdpa5.m
+++ b/training/test_ane_sdpa5.m
@@ -1,297 +1,304 @@
-// Debug: why causal mask doesn't apply. Try different approaches.
-#import <Foundation/Foundation.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#include <math.h>
-
-#define HEADS 12
-#define HD 64
-#define SEQ 8  // small for readable output
-
-static Class g_D, g_I, g_AR, g_AIO;
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-// Build inline mask string for MIL: tensor<fp16, [1,1,S,S]>([v00, v01, ...])
-static NSString *build_inline_causal_mask(int s) {
-    NSMutableString *vals = [NSMutableString string];
-    for (int t = 0; t < s; t++) {
-        for (int t2 = 0; t2 < s; t2++) {
-            if (t > 0 || t2 > 0) [vals appendString:@", "];
-            [vals appendString:(t2 <= t) ? @"0" : @"-65504"];  // fp16 -inf
-        }
-    }
-    return [NSString stringWithFormat:
-        @"tensor<fp16, [1, 1, %d, %d]>([%@])", s, s, vals];
-}
-
-static NSData *build_mask_blob(int seq) {
-    int wsize = seq * seq * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
-    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
-    _Float16 *fp16 = (_Float16*)(buf+128);
-    for (int t = 0; t < seq; t++)
-        for (int t2 = 0; t2 < seq; t2++)
-            fp16[t*seq + t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-typedef struct { id model; NSString *td; } Model;
-
-static Model compile_model(NSString *mil, NSDictionary *wd) {
-    Model m = {nil, nil};
-    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
-    if (!desc) { printf("  desc=NULL\n"); return m; }
-    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-    for (NSString *path in wd) {
-        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
-    }
-    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
-        printf("  compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:"");
-        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
-    }
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
-        printf("  load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
-    }
-    m.model = mdl; m.td = td;
-    return m;
-}
-
-static void cleanup_model(Model *m) {
-    if (!m->model) return;
-    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(m->model, @selector(unloadWithQoS:error:), 21, &e);
-    [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil];
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        ane_init();
-
-        srand48(42);
-        int total = HEADS * SEQ * HD;
-        _Float16 *Q = (_Float16*)malloc(total * 2);
-        _Float16 *K = (_Float16*)malloc(total * 2);
-        _Float16 *V = (_Float16*)malloc(total * 2);
-        for (int i = 0; i < total; i++) {
-            Q[i] = (_Float16)(0.5f * (2*drand48()-1));
-            K[i] = (_Float16)(0.5f * (2*drand48()-1));
-            V[i] = (_Float16)(0.5f * (2*drand48()-1));
-        }
-
-        size_t bytes = total * 2;
-        IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes);
-        IOSurfaceRef ioV = make_surface(bytes);
-        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL);
-        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL);
-        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL);
-        id wQ = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioQ);
-        id wK = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioK);
-        id wV = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioV);
-
-        // CPU references
-        float scale = 1.0f / sqrtf((float)HD);
-        float *cpu_causal = (float*)calloc(total, sizeof(float));
-        float *cpu_nocausal = (float*)calloc(total, sizeof(float));
-        for (int h = 0; h < HEADS; h++)
-            for (int t = 0; t < SEQ; t++) {
-                // Causal
-                float scores[SEQ], maxs = -1e30f;
-                for (int t2 = 0; t2 <= t; t2++) {
-                    float s = 0;
-                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
-                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
-                }
-                float sum = 0;
-                for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
-                for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
-                for (int d = 0; d < HD; d++) {
-                    float r = 0;
-                    for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
-                    cpu_causal[h*SEQ*HD+t*HD+d] = r;
-                }
-                // Non-causal
-                maxs = -1e30f;
-                for (int t2 = 0; t2 < SEQ; t2++) {
-                    float s = 0;
-                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
-                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
-                }
-                sum = 0;
-                for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
-                for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum;
-                for (int d = 0; d < HD; d++) {
-                    float r = 0;
-                    for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
-                    cpu_nocausal[h*SEQ*HD+t*HD+d] = r;
-                }
-            }
-
-        // Helper: eval and compare
-        void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) =
-            ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) {
-            IOSurfaceRef ioO = make_surface(bytes);
-            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
-            NSMutableArray *inArr = [NSMutableArray array];
-            NSMutableArray *inIdx = [NSMutableArray array];
-            for (int i = 0; i < nInputs; i++) {
-                [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])];
-                [inIdx addObject:@(i)];
-            }
-            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-                inArr, inIdx, @[wO], @[@0], nil, nil, @0);
-            NSError *e = nil;
-            BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-            if (!ok) {
-                printf("  %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:"");
-                CFRelease(ioO); return;
-            }
-            IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
-            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioO);
-            float dc=0, dnc=0;
-            for (int i = 0; i < total; i++) {
-                float v = (float)out[i];
-                float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1;
-                float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2;
-            }
-            IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
-            printf("  %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc,
-                   dc < dnc ? "CAUSAL" : (dc > dnc ? "NON-CAUSAL" : "SAME"));
-            CFRelease(ioO);
-        };
-
-        // === Test 1: No mask (should be non-causal) ===
-        printf("Test 1: no mask\n");
-        {
-            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
-                "    } -> (att);\n}\n",
-                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
-            Model m = compile_model(mil, nil);
-            if (m.model) {
-                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
-                eval_and_compare("no-mask", &m, 3, ins);
-                cleanup_model(&m);
-            }
-        }
-
-        // === Test 2: Inline causal mask ===
-        printf("\nTest 2: inline causal mask\n");
-        {
-            NSString *maskStr = build_inline_causal_mask(SEQ);
-            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-                "        %@ mask = const()[name = string(\"mask\"), val = %@];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
-                "    } -> (att);\n}\n",
-                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
-                [NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
-                HEADS, SEQ, HD];
-            Model m = compile_model(mil, nil);
-            if (m.model) {
-                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
-                eval_and_compare("inline-mask", &m, 3, ins);
-                cleanup_model(&m);
-            }
-        }
-
-        // === Test 3: BLOBFILE mask ===
-        printf("\nTest 3: BLOBFILE causal mask\n");
-        {
-            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-                "        tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
-                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
-                "    } -> (att);\n}\n",
-                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
-                SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
-            NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}};
-            Model m = compile_model(mil, wd);
-            if (m.model) {
-                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
-                eval_and_compare("blob-mask", &m, 3, ins);
-                cleanup_model(&m);
-            }
-        }
-
-        // === Test 4: mask as runtime input ===
-        printf("\nTest 4: mask as runtime input\n");
-        {
-            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
-                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
-                "tensor<fp16, [1, 1, %d, %d]> mask) {\n"
-                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
-                "    } -> (att);\n}\n",
-                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
-                SEQ, SEQ, HEADS, SEQ, HD];
-            Model m = compile_model(mil, nil);
-            if (m.model) {
-                // Create mask IOSurface
-                size_t mbytes = SEQ * SEQ * 2;
-                IOSurfaceRef ioM = make_surface(mbytes);
-                IOSurfaceLock(ioM, 0, NULL);
-                _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM);
-                for (int t = 0; t < SEQ; t++)
-                    for (int t2 = 0; t2 < SEQ; t2++)
-                        mp[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
-                IOSurfaceUnlock(ioM, 0, NULL);
-
-                IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM};
-                eval_and_compare("runtime-mask", &m, 4, ins);
-                CFRelease(ioM);
-                cleanup_model(&m);
-            }
-        }
-
-        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
-        free(Q); free(K); free(V);
-        free(cpu_causal); free(cpu_nocausal);
-        printf("\nDONE\n");
-    }
-    return 0;
-}
+// Debug: why causal mask doesn't apply. Try different approaches.
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#include <math.h>
+#include "ane_compat.h"
+
+#define HEADS 12
+#define HD 64
+#define SEQ 8  // small for readable output
+
+static Class g_D, g_I, g_AR, g_AIO;
+// Load the private AppleNeuralEngine framework and cache the four classes this test messages through objc_msgSend.
+static void ane_init(void) {
+    if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) printf("ane_init: dlopen failed: %s\n", dlerror());  // surface the loader error instead of failing later with nil classes
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest"); g_AIO = NSClassFromString(@"_ANEIOSurfaceObject");
+}
+// Create a single-row IOSurface holding `bytes` one-byte elements; the IOSurfaceCreate result is returned unchecked.
+static IOSurfaceRef make_surface(size_t bytes) {
+    NSDictionary *props = @{(id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, (id)kIOSurfaceBytesPerElement:@1,
+                            (id)kIOSurfaceBytesPerRow:@(bytes), (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+// Render an s-by-s causal mask as an inline MIL constant: tensor<fp16, [1, 1, s, s]>([v00, v01, ...]), row-major.
+// Entry (row, col) is 0 when col <= row and -65504 (the most negative finite fp16 value, not -inf) otherwise.
+static NSString *build_inline_causal_mask(int s) {
+    NSMutableArray *cells = [NSMutableArray array];
+    for (int row = 0; row < s; row++) {
+        for (int col = 0; col < s; col++) {
+            // Columns past the diagonal get the large negative value.
+            [cells addObject:(col > row) ? @"-65504" : @"0"];
+        }
+    }
+    return [NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>([%@])", s, s, [cells componentsJoinedByString:@", "]];
+}
+
+// Pack a seq-by-seq fp16 causal mask (0 on/below the diagonal, -65504 above it) behind a 128-byte zero-filled header.
+static NSData *build_mask_blob(int seq) {
+    int payload = seq * seq * 2, length = 128 + payload;  // two bytes per fp16 element
+    uint8_t *blob = (uint8_t*)calloc(length, 1);
+    const uint8_t magic[4] = {0xEF, 0xBE, 0xAD, 0xDE};   // header constants copied as-is; their meaning is not shown in this file
+    blob[0] = 1; blob[4] = 2; memcpy(blob + 64, magic, sizeof magic); blob[68] = 1;
+    uint32_t size32 = payload, off32 = 128; memcpy(blob + 72, &size32, 4); memcpy(blob + 80, &off32, 4);  // payload size, then payload offset
+    _Float16 *cell = (_Float16*)(blob + 128);
+    for (int row = 0; row < seq; row++)
+        for (int col = 0; col < seq; col++, cell++) *cell = (col > row) ? (_Float16)(-65504.0f) : (_Float16)0.0f;
+    return [NSData dataWithBytesNoCopy:blob length:length freeWhenDone:YES];  // the NSData takes ownership of blob
+}
+
+typedef struct { id model; NSString *td; } Model;
+
+// Build an in-memory ANE model from MIL text plus optional weight blobs, write both under a temp directory, then compile and load it.
+// Returns {nil, nil} on any failure (a reason is printed); on success m.td is the temp directory that cleanup_model removes.
+static Model compile_model(NSString *mil, NSDictionary *wd) {
+    Model m = {nil, nil}; NSError *e = nil;
+    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil);
+    if (!desc) { printf("  desc=NULL\n"); return m; }
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    if (!hx) { printf("  model=NULL\n"); return m; }  // a nil mdl also yields nil here; appending a nil path component below would raise
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    for (NSString *path in wd)  // strip the "@model_path/" prefix so each blob lands at the matching location under td
+        [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:"");
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
+    }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+        printf("  load FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m;
+    }
+    m.model = mdl; m.td = td; return m;
+}
+
+static void cleanup_model(Model *m) {  // Unload m->model and remove the path stored in m->td; does nothing when no model is held.
+    id loaded = m->model;
+    if (loaded == nil) return;
+    NSError *unloadError = nil;  // passed to the unload call but never inspected
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(loaded, @selector(unloadWithQoS:error:), 21, &unloadError);
+    [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil];
+}
+
+// Run scaled_dot_product_attention on the ANE with four mask variants and report whether each output matches the CPU causal or non-causal reference.
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+
+        srand48(42);
+        int total = HEADS * SEQ * HD;
+        _Float16 *Q = (_Float16*)malloc(total * 2);
+        _Float16 *K = (_Float16*)malloc(total * 2);
+        _Float16 *V = (_Float16*)malloc(total * 2);
+        if (!Q || !K || !V) { printf("malloc FAIL\n"); return 1; }  // the fill loop below writes through these pointers
+        for (int i = 0; i < total; i++) {
+            Q[i] = (_Float16)(0.5f * (2*drand48()-1));
+            K[i] = (_Float16)(0.5f * (2*drand48()-1));
+            V[i] = (_Float16)(0.5f * (2*drand48()-1));
+        }
+
+        size_t bytes = total * 2;
+        IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes);
+        IOSurfaceRef ioV = make_surface(bytes);
+        if (!ioQ || !ioK || !ioV) { printf("IOSurface create FAIL\n"); return 1; }  // the copies below need a valid base address
+        IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL);
+        IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL);
+        IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL);
+
+        // CPU references
+        float scale = 1.0f / sqrtf((float)HD);
+        float *cpu_causal = (float*)calloc(total, sizeof(float));
+        float *cpu_nocausal = (float*)calloc(total, sizeof(float));
+        if (!cpu_causal || !cpu_nocausal) { printf("calloc FAIL\n"); return 1; }
+        for (int h = 0; h < HEADS; h++)
+            for (int t = 0; t < SEQ; t++) {
+                // Causal
+                float scores[SEQ], maxs = -1e30f;
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float s = 0;
+                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                }
+                float sum = 0;
+                for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum;
+                for (int d = 0; d < HD; d++) {
+                    float r = 0;
+                    for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                    cpu_causal[h*SEQ*HD+t*HD+d] = r;
+                }
+                // Non-causal
+                maxs = -1e30f;
+                for (int t2 = 0; t2 < SEQ; t2++) {
+                    float s = 0;
+                    for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d];
+                    s *= scale; scores[t2] = s; if(s>maxs) maxs=s;
+                }
+                sum = 0;
+                for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; }
+                for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum;
+                for (int d = 0; d < HD; d++) {
+                    float r = 0;
+                    for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d];
+                    cpu_nocausal[h*SEQ*HD+t*HD+d] = r;
+                }
+            }
+
+        // Helper: eval and compare
+        void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) =
+            ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) {
+            IOSurfaceRef ioO = make_surface(bytes);
+            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+            NSMutableArray *inArr = [NSMutableArray array];
+            NSMutableArray *inIdx = [NSMutableArray array];
+            for (int i = 0; i < nInputs; i++) {
+                [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])];
+                [inIdx addObject:@(i)];
+            }
+            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                inArr, inIdx, @[wO], @[@0], nil, nil, @0);
+            NSError *e = nil;
+            BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+            if (!ok) {
+                printf("  %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:"");
+                CFRelease(ioO); return;
+            }
+            IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
+            _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioO);
+            float dc=0, dnc=0;
+            for (int i = 0; i < total; i++) {
+                float v = (float)out[i];
+                float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1;
+                float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2;
+            }
+            IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
+            printf("  %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc,
+                   dc < dnc ? "CAUSAL" : (dc > dnc ? "NON-CAUSAL" : "SAME"));
+            CFRelease(ioO);
+        };
+
+        // === Test 1: No mask (should be non-causal) ===
+        printf("Test 1: no mask\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("no-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 2: Inline causal mask ===
+        printf("\nTest 2: inline causal mask\n");
+        {
+            NSString *maskStr = build_inline_causal_mask(SEQ);
+            NSString *mil = [NSString stringWithFormat:
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        %@ mask = const()[name = string(\"mask\"), val = %@];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                [NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
+                HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("inline-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 3: BLOBFILE mask ===
+        printf("\nTest 3: BLOBFILE causal mask\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
+                "        tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
+                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
+            NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}};
+            Model m = compile_model(mil, wd);
+            if (m.model) {
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV};
+                eval_and_compare("blob-mask", &m, 3, ins);
+                cleanup_model(&m);
+            }
+        }
+
+        // === Test 4: mask as runtime input ===
+        printf("\nTest 4: mask as runtime input\n");
+        {
+            NSString *mil = [NSString stringWithFormat:
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, %d, %d]> q, "
+                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
+                "tensor<fp16, [1, 1, %d, %d]> mask) {\n"
+                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
+                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "    } -> (att);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
+                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
+                SEQ, SEQ, HEADS, SEQ, HD];
+            Model m = compile_model(mil, nil);
+            if (m.model) {
+                // Create mask IOSurface
+                size_t mbytes = SEQ * SEQ * 2;
+                IOSurfaceRef ioM = make_surface(mbytes);
+                IOSurfaceLock(ioM, 0, NULL);
+                _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM);
+                for (int t = 0; t < SEQ; t++)
+                    for (int t2 = 0; t2 < SEQ; t2++)
+                        mp[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f);
+                IOSurfaceUnlock(ioM, 0, NULL);
+
+                IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM};
+                eval_and_compare("runtime-mask", &m, 4, ins);
+                CFRelease(ioM);
+                cleanup_model(&m);
+            }
+        }
+
+        CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV);
+        free(Q); free(K); free(V); free(cpu_causal); free(cpu_nocausal);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m
index a396b4d..a52a11b 100644
--- a/training/test_conv_attn3.m
+++ b/training/test_conv_attn3.m
@@ -5,6 +5,7 @@
 #import <IOSurface/IOSurface.h>
 #import <mach/mach_time.h>
 #include <math.h>
+#include "ane_compat.h"
 
 #define HEADS 12
 #define HD 64
@@ -82,10 +83,10 @@ static void cleanup_kern(Kern *k) {
 
 static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) {
     return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
         "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
         "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
         "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
@@ -95,13 +96,15 @@ static void cleanup_kern(Kern *k) {
         "        int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n"
         "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
         "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n"
-        "    } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp];
+        "    } -> (y);\n}\n", g_ane_platform.mil_program, ane_mil_target(), ic, sp, oc, icg, oc, icg, groups, oc, sp];
 }
 
 int main() {
     @autoreleasepool {
         setbuf(stdout, NULL);
         ane_init();
+        ane_detect_platform();
+        ane_print_platform();
         mach_timebase_info(&g_tb);
 
         printf("=== Grouped Conv Causal Attention (layout A) ===\n");
diff --git a/training/test_full_fused.m b/training/test_full_fused.m
index 8449ddb..f9d36d4 100644
--- a/training/test_full_fused.m
+++ b/training/test_full_fused.m
@@ -7,6 +7,7 @@
 #import <IOSurface/IOSurface.h>
 #import <mach/mach_time.h>
 #include <math.h>
+#include "ane_compat.h"
 
 #define DIM 768
 #define HEADS 12
@@ -104,6 +105,8 @@ int main() {
     @autoreleasepool {
         setbuf(stdout, NULL);
         ane_init();
+        ane_detect_platform();
+        ane_print_platform();
         mach_timebase_info(&g_tb);
 
         srand48(42);
@@ -130,10 +133,10 @@ int main() {
             float scale_val = 1.0f / sqrtf((float)HD);
 
             NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
                 // Conv boilerplate
                 "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
                 "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
@@ -189,6 +192,7 @@ int main() {
                 "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr1, pad = pd, "
                 "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n"
                 "    } -> (out);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
                 DIM, SEQ,                              // input
                 DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM,      // Wq, Wk
                 DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM,      // Wv, Wo
@@ -317,10 +321,10 @@ int main() {
         printf("\n=== Test 2: Fused FFN benchmark ===\n");
         {
             NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+                "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+                "{\"coremltools-version\", \"\"}})]\n{\n"
+                "    func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
                 "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
                 "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
                 "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@@ -342,6 +346,7 @@ int main() {
                 "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr, pad = pd, "
                 "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n"
                 "    } -> (out);\n}\n",
+                g_ane_platform.mil_program, ane_mil_target(),
                 DIM, SEQ,
                 HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN,
                 HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, DIM,SEQ];
diff --git a/training/test_fused_bwd.m b/training/test_fused_bwd.m
index b91d7b6..9dd6f9b 100644
--- a/training/test_fused_bwd.m
+++ b/training/test_fused_bwd.m
@@ -1,184 +1,188 @@
-// Test: fused backward dx kernels
-// 1. Fused QKV backward: concat(Wq^T@dq, Wk^T@dk, Wv^T@dv) — 3 inputs, 1 output
-//    Problem: 3 separate gradient inputs. Can we concat them as input?
-//    Input: [1, DIM*3, 1, SEQ] = concat(dq, dk, dv)
-//    Use 3 separate convs on slices? MIL has slice_by_size.
-// 2. Fused W1b+W3b: input concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
-//    Two convs on slices, add results → [1, DIM, 1, SEQ]
-#import <Foundation/Foundation.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#include <math.h>
-
-#define DIM 768
-#define HIDDEN 2048
-#define SEQ 64
-
-static Class g_D, g_I, g_AR, g_AIO;
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-static NSData *build_blob_t(const float *w, int rows, int cols) {
-    int wsize = cols * rows * 2, total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;
-    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;
-    _Float16 *fp16 = (_Float16*)(buf+128);
-    for (int i = 0; i < rows; i++)
-        for (int j = 0; j < cols; j++)
-            fp16[j*rows+i] = (_Float16)w[i*cols+j];
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        ane_init();
-
-        srand48(42);
-        float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float));
-        float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float));
-        float sc = 1.0f/sqrtf(HIDDEN);
-        for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); }
-
-        // Test: fused W1b+W3b backward
-        // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
-        // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ]
-        // MIL: slice input → 2 convs → add
-        printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
-
-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"  // [1, HIDDEN*2, 1, SEQ]
-            "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-            // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
-            "        tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-            "        tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
-            "        tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
-            "        tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
-            // Conv: W1^T @ dh1, W3^T @ dh3
-            "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-            "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-            "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-            "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-            // W1^T: [DIM, HIDDEN, 1, 1]  (transposed from [HIDDEN, DIM])
-            "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
-            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
-            "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
-            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
-            "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
-            "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
-            // Add
-            "        tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
-            "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-            "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
-            "    } -> (y);\n}\n",
-            HIDDEN*2, SEQ, HIDDEN*2, SEQ,
-            HIDDEN, SEQ, HIDDEN, SEQ,  // slice1
-            HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,  // slice3
-            DIM, HIDDEN, DIM, HIDDEN,   // W1t
-            DIM, HIDDEN, DIM, HIDDEN,   // W3t
-            DIM, SEQ, DIM, SEQ,         // dx1, dx3
-            DIM, SEQ, DIM, SEQ];        // sum, y
-
-        NSDictionary *wd = @{
-            @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
-            @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)}
-        };
-
-        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil);
-        if (!desc) { printf("desc=NULL\n"); return 1; }
-        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-        for (NSString *path in wd) {
-            [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
-        }
-
-        NSError *e = nil;
-        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
-        printf("Compile: %s\n", ok?"OK":"FAIL");
-        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-        printf("Load: %s\n", ok?"OK":"FAIL");
-        if (!ok) return 1;
-
-        // Prepare input: concat(dh1, dh3) in channel-first layout
-        float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
-        float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
-        for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }
-
-        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
-        IOSurfaceLock(ioI, 0, NULL);
-        float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
-        // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
-        for (int t = 0; t < SEQ; t++) {
-            for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
-            for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
-        }
-        IOSurfaceUnlock(ioI, 0, NULL);
-
-        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
-        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
-        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-        printf("Eval: %s\n", ok?"OK":"FAIL");
-        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
-
-        // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3
-        float *ref = (float*)calloc(SEQ*DIM, sizeof(float));
-        for (int t = 0; t < SEQ; t++)
-            for (int i = 0; i < DIM; i++) {
-                float s = 0;
-                for (int j = 0; j < HIDDEN; j++) {
-                    s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i]
-                    s += W3[j*DIM+i] * dh3[t*HIDDEN+j];
-                }
-                ref[t*DIM+i] = s;
-            }
-
-        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
-        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
-        float maxd = 0;
-        for (int t = 0; t < SEQ; t++)
-            for (int c = 0; c < DIM; c++) {
-                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
-                if (d > maxd) maxd = d;
-            }
-        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
-        printf("dx max diff: %.6f\n", maxd);
-
-        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
-        CFRelease(ioI); CFRelease(ioO);
-        free(W1); free(W3); free(dh1); free(dh3); free(ref);
-        printf("\nDONE\n");
-    }
-    return 0;
-}
+// Test: fused backward dx kernels
+// 1. Fused QKV backward: concat(Wq^T@dq, Wk^T@dk, Wv^T@dv) — 3 inputs, 1 output
+//    Problem: 3 separate gradient inputs. Can we concat them as input?
+//    Input: [1, DIM*3, 1, SEQ] = concat(dq, dk, dv)
+//    Use 3 separate convs on slices? MIL has slice_by_size.
+// 2. Fused W1b+W3b: input concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+//    Two convs on slices, add results → [1, DIM, 1, SEQ]
+#import <Foundation/Foundation.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#include <math.h>
+#include "ane_compat.h"
+
+#define DIM 768
+#define HIDDEN 2048
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static void ane_init(void) {  // Load the private ANE framework and cache the four classes this test messages.
+    // Without these checks a missing framework/class only shows up later as "desc=NULL" in main; report the cause here.
+    if (!dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW)) fprintf(stderr, "ane_init: dlopen failed: %s\n", dlerror());
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest"); g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+    if (!g_D || !g_I || !g_AR || !g_AIO) fprintf(stderr, "ane_init: one or more _ANE* classes were not found\n");
+}
+static IOSurfaceRef make_surface(size_t bytes) {  // One-row IOSurface of `bytes` bytes; caller releases it with CFRelease.
+    NSDictionary *surfaceProps = @{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)surfaceProps); }
+static NSData *build_blob_t(const float *w, int rows, int cols) {  // Pack row-major w[rows][cols] into a header + transposed fp16 payload.
+    int wsize = cols * rows * 2, total = 128 + wsize;  // 2 bytes per fp16 element, preceded by 128 header bytes
+    uint8_t *buf = (uint8_t*)calloc(total, 1);  // zero-filled, so header bytes not written below stay 0
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1;  // bytes 64..67 spell 0xDEADBEEF little-endian; other field meanings not shown here — TODO confirm
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128;  // presumably payload size and payload offset — verify against the blob format
+    _Float16 *fp16 = (_Float16*)(buf+128);  // payload begins immediately after the 128-byte header
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j*rows+i] = (_Float16)w[i*cols+j];  // transpose while narrowing: source (i,j) is stored at (j,i)
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];  // NSData adopts buf and frees it; no copy is made
+}
+
+int main() {  // Compile, run and check the fused W1b+W3b backward kernel; returns 1 when an ANE step fails, otherwise 0.
+    @autoreleasepool {
+        setbuf(stdout, NULL);  // unbuffered stdout
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+
+        srand48(42);  // fixed seed so W1/W3 are identical on every run
+        float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float sc = 1.0f/sqrtf(HIDDEN);
+        for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); }  // uniform in [-sc, sc)
+
+        // Test: fused W1b+W3b backward
+        // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+        // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ]
+        // MIL: slice input → 2 convs → add
+        printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
+
+        NSString *mil = [NSString stringWithFormat:
+            @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n{\n"
+            "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"  // [1, HIDDEN*2, 1, SEQ]
+            "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+            // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
+            "        tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
+            "        tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
+            "        tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
+            // Conv: W1^T @ dh1, W3^T @ dh3
+            "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+            // W1^T: [DIM, HIDDEN, 1, 1]  (transposed from [HIDDEN, DIM])
+            "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
+            // Add
+            "        tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
+            "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+            "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
+            "    } -> (y);\n}\n",
+            g_ane_platform.mil_program, ane_mil_target(),
+            HIDDEN*2, SEQ, HIDDEN*2, SEQ,
+            HIDDEN, SEQ, HIDDEN, SEQ,  // slice1
+            HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,  // slice3
+            DIM, HIDDEN, DIM, HIDDEN,   // W1t
+            DIM, HIDDEN, DIM, HIDDEN,   // W3t
+            DIM, SEQ, DIM, SEQ,         // dx1, dx3
+            DIM, SEQ, DIM, SEQ];        // sum, y
+
+        NSDictionary *wd = @{  // keys match the BLOBFILE paths in the MIL text above
+            @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
+            @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)}
+        };
+
+        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil);
+        if (!desc) { printf("desc=NULL\n"); return 1; }
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        if (!mdl || !hx) { printf("model=NULL\n"); return 1; }  // a nil identifier must not reach the path append below
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];  // staging directory named after the model identifier
+        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        for (NSString *path in wd)  // write each blob under td, dropping the "@model_path/" prefix from its key
+            [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+
+        NSError *e = nil;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        printf("Compile: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        printf("Load: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }  // report the error like the Compile and Eval steps do
+
+        // Prepare input: concat(dh1, dh3) in channel-first layout
+        float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }  // deterministic synthetic gradients
+
+        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);  // 4 bytes per fp32 element
+        IOSurfaceLock(ioI, 0, NULL);
+        float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
+        // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
+        for (int t = 0; t < SEQ; t++) {
+            for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
+            for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
+        }
+        IOSurfaceUnlock(ioI, 0, NULL);
+
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        printf("Eval: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
+
+        // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3
+        float *ref = (float*)calloc(SEQ*DIM, sizeof(float));
+        for (int t = 0; t < SEQ; t++)
+            for (int i = 0; i < DIM; i++) {
+                float s = 0;
+                for (int j = 0; j < HIDDEN; j++) {
+                    s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i]
+                    s += W3[j*DIM+i] * dh3[t*HIDDEN+j];
+                }
+                ref[t*DIM+i] = s;
+            }
+
+        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
+        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
+        float maxd = 0;  // largest |ANE - CPU| difference; printed only, it never changes the exit code
+        for (int t = 0; t < SEQ; t++)
+            for (int c = 0; c < DIM; c++) {
+                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);  // output is read channel-first, reference is token-major
+                if (d > maxd) maxd = d;
+            }
+        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
+        printf("dx max diff: %.6f\n", maxd);
+
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];  // remove the staging directory created above
+        CFRelease(ioI); CFRelease(ioO);
+        free(W1); free(W3); free(dh1); free(dh3); free(ref);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m
index 69f41d6..14428f6 100644
--- a/training/test_fused_qkv.m
+++ b/training/test_fused_qkv.m
@@ -8,6 +8,7 @@
 #import <IOSurface/IOSurface.h>
 #import <mach/mach_time.h>
 #include <math.h>
+#include "ane_compat.h"
 
 #define DIM 768
 #define SEQ 64
@@ -86,10 +87,10 @@ static void cleanup_kern(Kern *k) {
 // Fused QKV: 3 convs + concat in one MIL
 static NSString *gen_fused_qkv_mil(void) {
     return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
         "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
         "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
         "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
@@ -115,6 +116,7 @@ static void cleanup_kern(Kern *k) {
         "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
         "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n"
         "    } -> (y);\n}\n",
+        g_ane_platform.mil_program, ane_mil_target(),
         DIM, SEQ, DIM, SEQ,
         DIM, DIM, DIM, DIM,  // Wq
         DIM, DIM, DIM, DIM,  // Wk
@@ -129,10 +131,10 @@ static void cleanup_kern(Kern *k) {
 // Single conv MIL for comparison
 static NSString *gen_single_mil(void) {
     return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
         "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
         "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
         "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
@@ -147,6 +149,7 @@ static void cleanup_kern(Kern *k) {
         "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
         "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
         "    } -> (y);\n}\n",
+        g_ane_platform.mil_program, ane_mil_target(),
         DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ];
 }
 
@@ -154,6 +157,8 @@ int main() {
     @autoreleasepool {
         setbuf(stdout, NULL);
         ane_init();
+        ane_detect_platform();
+        ane_print_platform();
         mach_timebase_info(&g_tb);
 
         printf("=== Fused QKV vs 3x Separate Convs ===\n");
diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m
index cf7b073..d7c1665 100644
--- a/training/test_perf_stats.m
+++ b/training/test_perf_stats.m
@@ -1,233 +1,236 @@
-// test_perf_stats.m — What does _ANEPerformanceStats expose?
-// Probe class methods, properties, instantiate, pass to request, read back.
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-
-static mach_timebase_info_data_t g_tb;
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-
-static void dump_class(const char *name) {
-    Class cls = NSClassFromString([NSString stringWithUTF8String:name]);
-    if (!cls) { printf("  %s: NOT FOUND\n", name); return; }
-    printf("\n=== %s ===\n", name);
-
-    unsigned int count;
-    Method *methods = class_copyMethodList(object_getClass(cls), &count);
-    if (count) printf("  Class methods:\n");
-    for (unsigned int i = 0; i < count; i++) {
-        SEL s = method_getName(methods[i]);
-        const char *enc = method_getTypeEncoding(methods[i]);
-        printf("    + %s  [%s]\n", sel_getName(s), enc ? enc : "?");
-    }
-    free(methods);
-
-    methods = class_copyMethodList(cls, &count);
-    if (count) printf("  Instance methods:\n");
-    for (unsigned int i = 0; i < count; i++) {
-        SEL s = method_getName(methods[i]);
-        const char *enc = method_getTypeEncoding(methods[i]);
-        printf("    - %s  [%s]\n", sel_getName(s), enc ? enc : "?");
-    }
-    free(methods);
-
-    unsigned int pcount;
-    objc_property_t *props = class_copyPropertyList(cls, &pcount);
-    if (pcount) printf("  Properties:\n");
-    for (unsigned int i = 0; i < pcount; i++) {
-        const char *pname = property_getName(props[i]);
-        const char *pattr = property_getAttributes(props[i]);
-        printf("    @property %s  [%s]\n", pname, pattr ? pattr : "?");
-    }
-    free(props);
-}
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        mach_timebase_info(&g_tb);
-        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-
-        printf("=== ANE Performance Stats Probe ===\n");
-
-        dump_class("_ANEPerformanceStats");
-        dump_class("_ANEPerfRequest");
-        dump_class("ANEPerfRequest");
-        dump_class("_ANEPerformanceCounters");
-        dump_class("_ANEDeviceInfo");
-        dump_class("_ANEModel");
-        dump_class("_ANEInMemoryModel");
-        dump_class("_ANERequest");
-        dump_class("_ANEIOSurfaceObject");
-        dump_class("_ANEInMemoryModelDescriptor");
-        dump_class("_ANEClient");
-        dump_class("_ANEVirtualClient");
-
-        // Try to instantiate _ANEPerformanceStats
-        printf("\n=== Instantiation Tests ===\n");
-        Class perfClass = NSClassFromString(@"_ANEPerformanceStats");
-        if (perfClass) {
-            @try {
-                id perfStats = [[perfClass alloc] init];
-                printf("_ANEPerformanceStats alloc/init: %s\n",
-                       perfStats ? [[perfStats description] UTF8String] : "nil");
-                if (perfStats) {
-                    unsigned int pcount;
-                    objc_property_t *props = class_copyPropertyList(perfClass, &pcount);
-                    for (unsigned int i = 0; i < pcount; i++) {
-                        const char *pname = property_getName(props[i]);
-                        @try {
-                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
-                            printf("  %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
-                        } @catch (NSException *ex) {
-                            printf("  %s = <exception: %s>\n", pname, [[ex reason] UTF8String]);
-                        }
-                    }
-                    free(props);
-                }
-            } @catch (NSException *ex) {
-                printf("Exception: %s\n", [[ex reason] UTF8String]);
-            }
-        }
-
-        // Compile a working kernel and test perfStats in request
-        printf("\n=== Compile kernel and test perfStats in request ===\n");
-        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
-        Class g_AR = NSClassFromString(@"_ANERequest");
-        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-
-        int CH = 64, SP = 32;
-        _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16));
-        for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f;
-        int ws = CH*CH*2, tot = 128+ws;
-        uint8_t *blob = (uint8_t*)calloc(tot,1);
-        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
-        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
-        memcpy(blob+128, w, ws);
-        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
-        free(w);
-
-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
-
-        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
-        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
-            md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
-        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
-            withIntermediateDirectories:YES attributes:nil error:nil];
-        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-
-        NSError *e = nil;
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-
-        int ioBytes = CH * SP * 4; // fp32
-        IOSurfaceRef ioIn = make_surface(ioBytes);
-        IOSurfaceRef ioOut = make_surface(ioBytes);
-        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-
-        // Try creating request WITH perfStats
-        if (perfClass) {
-            id perfStats = [[perfClass alloc] init];
-            printf("  Creating request with perfStats=%s\n", perfStats ? "non-nil" : "nil");
-
-            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-                @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0);
-            printf("  Request: %s\n", req ? "created" : "nil");
-
-            if (req) {
-                IOSurfaceLock(ioIn, 0, NULL);
-                float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-                for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
-                IOSurfaceUnlock(ioIn, 0, NULL);
-
-                BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                    mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-                printf("  Eval: %s\n", ok ? "OK" : [[e description] UTF8String]);
-
-                if (ok && perfStats) {
-                    printf("\n  PerfStats after 1 eval:\n");
-                    unsigned int pcount;
-                    objc_property_t *props = class_copyPropertyList(perfClass, &pcount);
-                    for (unsigned int i = 0; i < pcount; i++) {
-                        const char *pname = property_getName(props[i]);
-                        @try {
-                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
-                            printf("    %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
-                        } @catch (NSException *ex) {
-                            printf("    %s = <exception>\n", pname);
-                        }
-                    }
-                    free(props);
-
-                    printf("\n  Running 100 evals...\n");
-                    uint64_t t0 = mach_absolute_time();
-                    for (int i = 0; i < 100; i++) {
-                        ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-                    }
-                    printf("  100 evals in %.1fms (%.2fms/eval)\n",
-                           tb_ms(mach_absolute_time()-t0), tb_ms(mach_absolute_time()-t0)/100.0);
-
-                    printf("\n  PerfStats after 101 evals:\n");
-                    props = class_copyPropertyList(perfClass, &pcount);
-                    for (unsigned int i = 0; i < pcount; i++) {
-                        const char *pname = property_getName(props[i]);
-                        @try {
-                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
-                            printf("    %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
-                        } @catch (NSException *ex) {
-                            printf("    %s = <exception>\n", pname);
-                        }
-                    }
-                    free(props);
-                }
-            }
-        } else {
-            printf("  _ANEPerformanceStats class NOT FOUND\n");
-        }
-
-        // Cleanup
-        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
-        CFRelease(ioIn); CFRelease(ioOut);
-    }
-    return 0;
-}
+// test_perf_stats.m — What does _ANEPerformanceStats expose?
+// Probe class methods, properties, instantiate, pass to request, read back.
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include "ane_compat.h"
+
+static mach_timebase_info_data_t g_tb;
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } // Scale tick count t by g_tb.numer/g_tb.denom, then divide by 1e6 (ns -> ms once main has filled g_tb via mach_timebase_info).
+
+static void dump_class(const char *name) { // Print the class methods, instance methods and declared properties of the runtime class called `name`.
+    Class cls = NSClassFromString([NSString stringWithUTF8String:name]); // look the class up by name at runtime
+    if (!cls) { printf("  %s: NOT FOUND\n", name); return; } // lookup failed: report it and print nothing else
+    printf("\n=== %s ===\n", name);
+
+    unsigned int count;
+    Method *methods = class_copyMethodList(object_getClass(cls), &count); // object_getClass(cls) is the metaclass; its methods are the "+" class methods
+    if (count) printf("  Class methods:\n"); // heading only when there is at least one entry
+    for (unsigned int i = 0; i < count; i++) {
+        SEL s = method_getName(methods[i]);
+        const char *enc = method_getTypeEncoding(methods[i]);
+        printf("    + %s  [%s]\n", sel_getName(s), enc ? enc : "?"); // "?" stands in for a missing type encoding
+    }
+    free(methods); // release the copied method array
+
+    methods = class_copyMethodList(cls, &count); // same walk over the class itself: the "-" instance methods
+    if (count) printf("  Instance methods:\n");
+    for (unsigned int i = 0; i < count; i++) {
+        SEL s = method_getName(methods[i]);
+        const char *enc = method_getTypeEncoding(methods[i]);
+        printf("    - %s  [%s]\n", sel_getName(s), enc ? enc : "?");
+    }
+    free(methods);
+
+    unsigned int pcount;
+    objc_property_t *props = class_copyPropertyList(cls, &pcount); // declared properties of the class
+    if (pcount) printf("  Properties:\n");
+    for (unsigned int i = 0; i < pcount; i++) {
+        const char *pname = property_getName(props[i]);
+        const char *pattr = property_getAttributes(props[i]);
+        printf("    @property %s  [%s]\n", pname, pattr ? pattr : "?"); // attribute string printed raw, "?" when absent
+    }
+    free(props); // release the copied property array
+}
+
+static IOSurfaceRef make_surface(size_t bytes) { // Create an IOSurface described as one row of `bytes` one-byte elements; main releases the result with CFRelease.
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, // width = bytes, height = 1
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), // 1 byte per element, a single row of `bytes` bytes
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); // allocation size = bytes, pixel format value 0
+}
+
+int main() { // Probe _ANEPerformanceStats: dump ANE runtime classes, then compile, load, run and time one small conv kernel with a perfStats object attached.
+    @autoreleasepool {
+        setbuf(stdout, NULL); // unbuffered stdout so output appears immediately
+        mach_timebase_info(&g_tb); // fill the timebase that tb_ms() reads
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+        ane_detect_platform(); // declared in ane_compat.h (not visible here); presumably fills g_ane_platform used for the MIL header below
+        ane_print_platform();
+
+        printf("=== ANE Performance Stats Probe ===\n");
+
+        dump_class("_ANEPerformanceStats");
+        dump_class("_ANEPerfRequest");
+        dump_class("ANEPerfRequest");
+        dump_class("_ANEPerformanceCounters");
+        dump_class("_ANEDeviceInfo");
+        dump_class("_ANEModel");
+        dump_class("_ANEInMemoryModel");
+        dump_class("_ANERequest");
+        dump_class("_ANEIOSurfaceObject");
+        dump_class("_ANEInMemoryModelDescriptor");
+        dump_class("_ANEClient");
+        dump_class("_ANEVirtualClient");
+
+        // Try to instantiate _ANEPerformanceStats
+        printf("\n=== Instantiation Tests ===\n");
+        Class perfClass = NSClassFromString(@"_ANEPerformanceStats");
+        if (perfClass) {
+            @try { // private class: alloc/init or description may raise, so the whole probe is guarded
+                id perfStats = [[perfClass alloc] init];
+                printf("_ANEPerformanceStats alloc/init: %s\n",
+                       perfStats ? [[perfStats description] UTF8String] : "nil");
+                if (perfStats) {
+                    unsigned int pcount;
+                    objc_property_t *props = class_copyPropertyList(perfClass, &pcount);
+                    for (unsigned int i = 0; i < pcount; i++) {
+                        const char *pname = property_getName(props[i]);
+                        @try { // read each property through KVC; one failing key must not abort the listing
+                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
+                            printf("  %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
+                        } @catch (NSException *ex) {
+                            printf("  %s = <exception: %s>\n", pname, [[ex reason] UTF8String]);
+                        }
+                    }
+                    free(props);
+                }
+            } @catch (NSException *ex) {
+                printf("Exception: %s\n", [[ex reason] UTF8String]);
+            }
+        }
+
+        // Compile a working kernel and test perfStats in request
+        printf("\n=== Compile kernel and test perfStats in request ===\n");
+        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
+        Class g_AR = NSClassFromString(@"_ANERequest");
+        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+
+        int CH = 64, SP = 32; // channel count and spatial width used in every tensor shape below
+        _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16));
+        for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; // ones on the diagonal, zeros elsewhere
+        int ws = CH*CH*2, tot = 128+ws; // fp16 payload size in bytes, plus a 128-byte header
+        uint8_t *blob = (uint8_t*)calloc(tot,1);
+        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; // header bytes; layout presumably what the ANE weight loader expects (not documented here)
+        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; // payload size at byte 72, payload start (128) at byte 80
+        memcpy(blob+128, w, ws);
+        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; // wdata takes ownership of blob
+        free(w);
+
+        NSString *mil = [NSString stringWithFormat: // MIL text: fp32 input -> cast to fp16 -> conv with W -> cast back to fp32
+            @"program(%s)\n"
+            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n"
+            "{\n"
+            "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
+            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
+            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
+            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+            "[name=string(\"conv\")];\n"
+            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
+            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
+            "    } -> (y);\n"
+            "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+
+        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
+            md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; // per-model scratch directory named after the model's hex identifier
+        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+            withIntermediateDirectories:YES attributes:nil error:nil];
+        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+        NSError *e = nil;
+        BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        BOOL loaded = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        if (!compiled || !loaded) printf("  WARNING: compile=%d load=%d: %s\n", compiled, loaded, e ? [[e description] UTF8String] : "(no NSError)"); // surface failures instead of discarding the results
+        int ioBytes = CH * SP * 4; // fp32
+        IOSurfaceRef ioIn = make_surface(ioBytes);
+        IOSurfaceRef ioOut = make_surface(ioBytes);
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+
+        // Try creating request WITH perfStats
+        if (perfClass) {
+            id perfStats = [[perfClass alloc] init];
+            printf("  Creating request with perfStats=%s\n", perfStats ? "non-nil" : "nil");
+
+            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0);
+            printf("  Request: %s\n", req ? "created" : "nil");
+
+            if (req) {
+                IOSurfaceLock(ioIn, 0, NULL);
+                float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+                for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; // fill the whole input with ones
+                IOSurfaceUnlock(ioIn, 0, NULL);
+
+                BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                    mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+                printf("  Eval: %s\n", ok ? "OK" : (e ? [[e description] UTF8String] : "FAILED (no NSError)")); // never hand a NULL pointer to %s
+
+                if (ok && perfStats) {
+                    printf("\n  PerfStats after 1 eval:\n");
+                    unsigned int pcount;
+                    objc_property_t *props = class_copyPropertyList(perfClass, &pcount);
+                    for (unsigned int i = 0; i < pcount; i++) {
+                        const char *pname = property_getName(props[i]);
+                        @try {
+                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
+                            printf("    %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
+                        } @catch (NSException *ex) {
+                            printf("    %s = <exception>\n", pname);
+                        }
+                    }
+                    free(props);
+
+                    printf("\n  Running 100 evals...\n");
+                    uint64_t t0 = mach_absolute_time();
+                    for (int i = 0; i < 100; i++) {
+                        ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+                    }
+                    double total_ms = tb_ms(mach_absolute_time() - t0); // read the clock once so total and per-eval figures agree
+                    printf("  100 evals in %.1fms (%.2fms/eval)\n", total_ms, total_ms / 100.0);
+
+                    printf("\n  PerfStats after 101 evals:\n");
+                    props = class_copyPropertyList(perfClass, &pcount);
+                    for (unsigned int i = 0; i < pcount; i++) {
+                        const char *pname = property_getName(props[i]);
+                        @try {
+                            id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]];
+                            printf("    %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
+                        } @catch (NSException *ex) {
+                            printf("    %s = <exception>\n", pname);
+                        }
+                    }
+                    free(props);
+                }
+            }
+        } else {
+            printf("  _ANEPerformanceStats class NOT FOUND\n");
+        }
+
+        // Cleanup
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; // remove the scratch directory created above
+        CFRelease(ioIn); CFRelease(ioOut);
+    }
+    return 0;
+}
diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m
index 2802c6b..c0dd7d2 100644
--- a/training/test_qos_sweep.m
+++ b/training/test_qos_sweep.m
@@ -1,157 +1,160 @@
-// test_qos_sweep.m — Does QoS affect frequency/latency?
-// Sweep QoS 0-63 on compile, load, eval of a working kernel.
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-
-static mach_timebase_info_data_t g_tb;
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        mach_timebase_info(&g_tb);
-        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-
-        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
-        Class g_AR = NSClassFromString(@"_ANERequest");
-        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-
-        // 256x256 conv, spatial=64 for measurable latency
-        int CH = 256, SP = 64;
-        int ws = CH*CH*2, tot = 128+ws;
-        uint8_t *blob = (uint8_t*)calloc(tot, 1);
-        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
-        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
-        _Float16 *wp = (_Float16*)(blob+128);
-        for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
-        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
-
-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
-
-        NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}};
-        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
-        NSFileManager *fm = [NSFileManager defaultManager];
-
-        printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n");
-        printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6);
-        printf("%4s %10s %10s %10s %10s  %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");
-
-        unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
-        int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);
-
-        for (int qi = 0; qi < n_qos; qi++) {
-            unsigned int qos = qos_values[qi];
-            NSError *e = nil;
-
-            // Make unique weights per iteration so hex differs
-            _Float16 *wq = (_Float16*)(blob+128);
-            wq[0] = (_Float16)(0.001f * qi);
-            NSData *wdata_q = [NSData dataWithBytes:blob length:tot];
-            NSDictionary *weights_q = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata_q}};
-
-            id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
-                milData, weights_q, nil);
-            id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-            id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-            NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-            [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
-                withIntermediateDirectories:YES attributes:nil error:nil];
-            [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-            [wdata_q writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-
-            uint64_t t0 = mach_absolute_time();
-            BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
-                mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e);
-            double cms = tb_ms(mach_absolute_time() - t0);
-
-            if (!cok) {
-                printf("%4u %10s %10s %10s %10s  COMPILE_FAIL\n", qos, "-", "-", "-", "-");
-                [fm removeItemAtPath:td error:nil];
-                continue;
-            }
-
-            t0 = mach_absolute_time();
-            BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
-                mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e);
-            double lms = tb_ms(mach_absolute_time() - t0);
-
-            if (!lok) {
-                printf("%4u %8.1fms %10s %10s %10s  LOAD_FAIL\n", qos, cms, "-", "-", "-");
-                ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-                [fm removeItemAtPath:td error:nil];
-                continue;
-            }
-
-            int ioBytes = CH * SP * 4;
-            IOSurfaceRef ioIn = make_surface(ioBytes);
-            IOSurfaceRef ioOut = make_surface(ioBytes);
-            id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-                @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-
-            IOSurfaceLock(ioIn, 0, NULL);
-            float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-            for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
-            IOSurfaceUnlock(ioIn, 0, NULL);
-
-            t0 = mach_absolute_time();
-            BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
-            double ems1 = tb_ms(mach_absolute_time() - t0);
-
-            if (!eok) {
-                printf("%4u %8.1fms %8.1fms %10s %10s  EVAL_FAIL\n", qos, cms, lms, "-", "-");
-            } else {
-                t0 = mach_absolute_time();
-                for (int i = 0; i < 10; i++) {
-                    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-                        mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
-                }
-                double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0;
-                printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms  OK\n", qos, cms, lms, ems1, ems_avg);
-            }
-
-            ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-            CFRelease(ioIn); CFRelease(ioOut);
-            [fm removeItemAtPath:td error:nil];
-        }
-
-        printf("\nDone.\n");
-    }
-    return 0;
-}
+// test_qos_sweep.m — Does QoS affect frequency/latency?
+// Sweep a sampled set of QoS values in 0-63 across compile, load, and eval of a working kernel.
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include "ane_compat.h"
+
+static mach_timebase_info_data_t g_tb;
+static double tb_ms(uint64_t t) { double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }  // mach ticks -> milliseconds via g_tb
+
+static IOSurfaceRef make_surface(size_t bytes) {  // single-row surface of `bytes` one-byte elements; callers CFRelease it
+    NSNumber *len = @(bytes);
+    NSDictionary *props = @{(id)kIOSurfaceWidth: len, (id)kIOSurfaceHeight: @1, (id)kIOSurfaceBytesPerElement: @1,
+                            (id)kIOSurfaceBytesPerRow: len, (id)kIOSurfaceAllocSize: len, (id)kIOSurfacePixelFormat: @0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+int main() {  // Runs compile -> load -> eval for each QoS value below, printing one timing row per value; returns 1 on setup failure.
+    @autoreleasepool {
+        setbuf(stdout, NULL);  // unbuffered stdout: each result row is visible as soon as it is printed
+        mach_timebase_info(&g_tb);
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+        ane_detect_platform();
+        ane_print_platform();
+
+        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
+        Class g_AR = NSClassFromString(@"_ANERequest");
+        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+        if (!g_D || !g_I || !g_AR || !g_AIO) { printf("FAIL: ANE classes not found\n"); return 1; }
+        // 256x256 conv, spatial=64 for measurable latency
+        int CH = 256, SP = 64;
+        int ws = CH*CH*2, tot = 128+ws;  // fp16 payload bytes, plus 128 header bytes in front
+        uint8_t *blob = (uint8_t*)calloc(tot, 1);
+        if (!blob) { printf("FAIL: out of memory\n"); return 1; }
+        blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1;
+        *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128;
+        _Float16 *wp = (_Float16*)(blob+128);
+        for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
+
+        NSString *mil = [NSString stringWithFormat:
+            @"program(%s)\n"
+            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n"
+            "{\n"
+            "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
+            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
+            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
+            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+            "[name=string(\"conv\")];\n"
+            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
+            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
+            "    } -> (y);\n"
+            "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+
+        // blob stays a plain malloc buffer (no NoCopy NSData owner): each sweep iteration patches it and copies it into wdata_q.
+        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        NSFileManager *fm = [NSFileManager defaultManager];
+
+        printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n");
+        printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6);
+        printf("%4s %10s %10s %10s %10s  %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");
+
+        unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
+        int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);
+
+        for (int qi = 0; qi < n_qos; qi++) {
+            unsigned int qos = qos_values[qi];
+            NSError *e = nil;
+
+            // Make unique weights per iteration so hex differs
+            _Float16 *wq = (_Float16*)(blob+128);
+            wq[0] = (_Float16)(0.001f * qi);
+            NSData *wdata_q = [NSData dataWithBytes:blob length:tot];
+            NSDictionary *weights_q = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata_q}};
+
+            id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
+                milData, weights_q, nil);
+            id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+            id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+            if (!mdl || !hx) { printf("%4u %10s %10s %10s %10s  MODEL_FAIL\n", qos, "-", "-", "-", "-"); continue; }  // nil hx would raise below
+            NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+            [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
+                withIntermediateDirectories:YES attributes:nil error:nil];
+            [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+            [wdata_q writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+            uint64_t t0 = mach_absolute_time();
+            BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+                mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e);
+            double cms = tb_ms(mach_absolute_time() - t0);
+
+            if (!cok) {
+                printf("%4u %10s %10s %10s %10s  COMPILE_FAIL\n", qos, "-", "-", "-", "-");
+                [fm removeItemAtPath:td error:nil];
+                continue;
+            }
+
+            t0 = mach_absolute_time();
+            BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
+                mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e);
+            double lms = tb_ms(mach_absolute_time() - t0);
+
+            if (!lok) {
+                printf("%4u %8.1fms %10s %10s %10s  LOAD_FAIL\n", qos, cms, "-", "-", "-");
+                ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+                [fm removeItemAtPath:td error:nil];
+                continue;
+            }
+
+            int ioBytes = CH * SP * 4;
+            IOSurfaceRef ioIn = make_surface(ioBytes);
+            IOSurfaceRef ioOut = make_surface(ioBytes);
+            id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+            id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+            id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+            IOSurfaceLock(ioIn, 0, NULL);
+            float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+            for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
+            IOSurfaceUnlock(ioIn, 0, NULL);
+
+            t0 = mach_absolute_time();
+            BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
+            double ems1 = tb_ms(mach_absolute_time() - t0);
+
+            if (!eok) {
+                printf("%4u %8.1fms %8.1fms %10s %10s  EVAL_FAIL\n", qos, cms, lms, "-", "-");
+            } else {
+                t0 = mach_absolute_time();
+                for (int i = 0; i < 10; i++) {
+                    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+                        mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e);
+                }
+                double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0;
+                printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms  OK\n", qos, cms, lms, ems1, ems_avg);
+            }
+
+            ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+            CFRelease(ioIn); CFRelease(ioOut);
+            [fm removeItemAtPath:td error:nil];
+        }
+        free(blob);  // the sweep is done with the template buffer; every NSData above holds its own copy
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m
index a248005..cb3fca1 100644
--- a/training/test_weight_reload.m
+++ b/training/test_weight_reload.m
@@ -1,253 +1,256 @@
-// test_weight_reload.m — Can we skip recompilation by rewriting weight blobs on disk?
-// Compile a conv kernel with weights A, eval, verify output.
-// Overwrite weights/weight.bin in tmpDir with weights B.
-// unloadWithQoS: then loadWithQoS: (no recompile).
-// Eval again — if output matches B @ x, compilation bottleneck is eliminated.
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#include <math.h>
-
-static mach_timebase_info_data_t g_tb;
-static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-// Build weight blob matching inmem_peak format (single chunk)
-static NSData *build_weight_blob(_Float16 *w, int rows, int cols) {
-    int ws = rows * cols * 2;
-    int tot = 128 + ws;
-    uint8_t *b = (uint8_t*)calloc(tot, 1);
-    b[0] = 1; b[4] = 2;
-    b[64] = 0xEF; b[65] = 0xBE; b[66] = 0xAD; b[67] = 0xDE; b[68] = 1;
-    *(uint32_t*)(b+72) = ws;
-    *(uint32_t*)(b+80) = 128;
-    memcpy(b + 128, w, ws);
-    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
-}
-
-// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out
-static NSString *gen_mil(int ch, int sp) {
-    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-        "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-        "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-        "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-        "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-        "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-        "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-        "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-        "[name=string(\"conv\")];\n"
-        "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-        "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-        "    } -> (y);\n"
-        "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
-}
-
-int main() {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        mach_timebase_info(&g_tb);
-        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-
-        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
-        Class g_AR = NSClassFromString(@"_ANERequest");
-        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-
-        if (!g_D || !g_I || !g_AR || !g_AIO) {
-            printf("FAIL: ANE classes not found\n");
-            return 1;
-        }
-
-        // Use 64-channel conv, spatial=32 (known to work on ANE)
-        int CH = 64, SP = 32;
-
-        // Weight set A: scaled identity (1.0 on diagonal)
-        _Float16 *weightsA = (_Float16*)calloc(CH*CH, sizeof(_Float16));
-        for (int i = 0; i < CH; i++) weightsA[i*CH+i] = (_Float16)1.0f;
-
-        // Weight set B: 3x identity
-        _Float16 *weightsB = (_Float16*)calloc(CH*CH, sizeof(_Float16));
-        for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f;
-
-        NSData *wdataA = build_weight_blob(weightsA, CH, CH);
-        NSString *mil = gen_mil(CH, SP);
-        NSDictionary *weights = @{
-            @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
-        };
-        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
-
-        // === Compile with weights A ===
-        printf("=== Step 1: Compile with weights A (identity) ===\n");
-        printf("  Kernel: %dx%d conv, spatial=%d\n", CH, CH, SP);
-        uint64_t t0 = mach_absolute_time();
-        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil);
-        if (!desc) { printf("FAIL: desc=NULL\n"); return 1; }
-        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        NSFileManager *fm = [NSFileManager defaultManager];
-        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-        [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-
-        NSError *e = nil;
-        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
-        if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; }
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-        if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; }
-        double compile_ms = tb_ms(mach_absolute_time() - t0);
-        printf("  Compile+load: %.1fms\n", compile_ms);
-        printf("  tmpDir: %s\n", [td UTF8String]);
-
-        // Build request and IOSurfaces (fp32 I/O)
-        int inBytes = CH * SP * 4;  // fp32
-        int outBytes = CH * SP * 4;
-        IOSurfaceRef ioIn = make_surface(inBytes);
-        IOSurfaceRef ioOut = make_surface(outBytes);
-        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-
-        // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01
-        IOSurfaceLock(ioIn, 0, NULL);
-        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++)
-            for (int s = 0; s < SP; s++)
-                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
-        IOSurfaceUnlock(ioIn, 0, NULL);
-
-        // Eval with weights A
-        printf("\n=== Step 2: Eval with weights A ===\n");
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-        if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
-
-        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
-        printf("  Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]);
-        printf("  Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
-               outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]);
-        // Save copy
-        float *outA_copy = (float*)malloc(outBytes);
-        memcpy(outA_copy, outA, outBytes);
-        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
-
-        // === Step 3: Overwrite weight file with B, unload+load ===
-        printf("\n=== Step 3: Overwrite weight.bin with B (3x identity), unload+load ===\n");
-        NSData *wdataB = build_weight_blob(weightsB, CH, CH);
-        NSString *weightPath = [td stringByAppendingPathComponent:@"weights/weight.bin"];
-        [wdataB writeToFile:weightPath atomically:YES];
-        printf("  Wrote new weight.bin\n");
-
-        // Unload
-        t0 = mach_absolute_time();
-        ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        double unload_ms = tb_ms(mach_absolute_time() - t0);
-        printf("  Unload: %s (%.2fms)\n", ok ? "OK" : "FAIL", unload_ms);
-
-        // Reload (no compile!)
-        t0 = mach_absolute_time();
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-        double reload_ms = tb_ms(mach_absolute_time() - t0);
-        printf("  Load (no recompile): %s (%.2fms)\n", ok ? "OK" : [[e description] UTF8String], reload_ms);
-
-        if (!ok) {
-            printf("\n*** Load-after-overwrite FAILED — trying recompile+load ***\n");
-            t0 = mach_absolute_time();
-            ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
-            printf("  Re-compile: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
-            t0 = mach_absolute_time();
-            ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
-            printf("  Re-load: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
-        }
-
-        // Build new request (re-use same surfaces)
-        wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
-        wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
-        req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-
-        // Re-write same input
-        IOSurfaceLock(ioIn, 0, NULL);
-        inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++)
-            for (int s = 0; s < SP; s++)
-                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
-        IOSurfaceUnlock(ioIn, 0, NULL);
-
-        // Eval with (possibly reloaded) weights B
-        printf("\n=== Step 4: Eval after reload ===\n");
-        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-        if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
-
-        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
-        printf("  Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]);
-        printf("  Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
-               outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]);
-
-        // Check: did the output change?
-        bool changed = false;
-        float max_diff = 0;
-        for (int i = 0; i < CH*SP; i++) {
-            float d = fabsf(outB[i] - outA_copy[i]);
-            if (d > max_diff) max_diff = d;
-            if (d > 0.001f) changed = true;
-        }
-        // Expected: output B should be 3x output A
-        bool correct_3x = true;
-        float max_3x_err = 0;
-        for (int i = 0; i < CH*SP; i++) {
-            float expected = outA_copy[i] * 3.0f;
-            float err = fabsf(outB[i] - expected);
-            if (err > max_3x_err) max_3x_err = err;
-            if (err > 0.1f) correct_3x = false;
-        }
-        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
-
-        printf("\n=== RESULT ===\n");
-        printf("  Max A-B diff: %.6f\n", max_diff);
-        printf("  Max 3x error: %.6f\n", max_3x_err);
-        printf("  Compile+load: %.1fms | Unload: %.1fms | Reload: %.1fms\n", compile_ms, unload_ms, reload_ms);
-
-        if (changed && correct_3x) {
-            printf("\nSUCCESS: Weight reload works! Output matches 3x identity.\n");
-            printf("  Speedup: compile=%.1fms vs reload=%.1fms (%.1fx faster)\n",
-                   compile_ms, unload_ms + reload_ms, compile_ms / (unload_ms + reload_ms));
-            printf(">>> Compilation bottleneck can be eliminated <<<\n");
-        } else if (changed && !correct_3x) {
-            printf("\nPARTIAL: Output changed but doesn't match expected 3x.\n");
-        } else {
-            printf("\nFAIL: Output did NOT change. Weight reload does not work.\n");
-            printf("  ANE cached the compiled model — weights baked at compile time.\n");
-            printf(">>> Need alternative: weightsBuffer IOSurface or async recompile <<<\n");
-        }
-
-        // Cleanup
-        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        [fm removeItemAtPath:td error:nil];
-        CFRelease(ioIn); CFRelease(ioOut);
-        free(outA_copy); free(weightsA); free(weightsB);
-    }
-    return 0;
-}
+// test_weight_reload.m — Can we skip recompilation by rewriting weight blobs on disk?
+// Compile a conv kernel with weights A, eval, verify output.
+// Overwrite weights/weight.bin in tmpDir with weights B.
+// unloadWithQoS: then loadWithQoS: (no recompile).
+// Eval again — if output matches B @ x, compilation bottleneck is eliminated.
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include "ane_compat.h"
+
+static mach_timebase_info_data_t g_tb;
+static double tb_ms(uint64_t t) { double ns = (double)t * g_tb.numer / g_tb.denom; return ns / 1e6; }  // mach ticks -> milliseconds via g_tb
+
+static IOSurfaceRef make_surface(size_t bytes) {  // single-row surface of `bytes` one-byte elements; callers CFRelease it
+    NSNumber *len = @(bytes);
+    NSDictionary *props = @{(id)kIOSurfaceWidth: len, (id)kIOSurfaceHeight: @1, (id)kIOSurfaceBytesPerElement: @1,
+                            (id)kIOSurfaceBytesPerRow: len, (id)kIOSurfaceAllocSize: len, (id)kIOSurfacePixelFormat: @0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+// Wrap rows*cols fp16 weights in the single-chunk blob used by inmem_peak: 128 header bytes, then the raw payload.
+static NSData *build_weight_blob(_Float16 *w, int rows, int cols) {
+    const int payloadBytes = rows * cols * 2;              // 2 bytes per fp16 element
+    const int blobBytes = 128 + payloadBytes;
+    uint8_t *blob = (uint8_t*)calloc(blobBytes, 1);        // header bytes not set below stay zero
+    const uint8_t tag[5] = {0xEF, 0xBE, 0xAD, 0xDE, 1};    // lands at bytes 64..68
+    const uint32_t u32At72 = payloadBytes, u32At80 = 128;  // NOTE(review): presumably payload size and payload offset; confirm against the blob format
+    blob[0] = 1; blob[4] = 2; memcpy(blob + 64, tag, sizeof tag);
+    memcpy(blob + 72, &u32At72, sizeof u32At72); memcpy(blob + 80, &u32At80, sizeof u32At80);
+    memcpy(blob + 128, w, payloadBytes);
+    return [NSData dataWithBytesNoCopy:blob length:blobBytes freeWhenDone:YES];  // the NSData takes over and frees blob
+}
+
+// Generate MIL text for a 1x1 conv over a [1, ch, 1, sp] tensor: fp32 in → cast fp16 → conv → cast fp32 out.
+static NSString *gen_mil(int ch, int sp) {  // ch is both the input and output channel count (W is [ch, ch, 1, 1])
+    return [NSString stringWithFormat:  // first %s fills program(...), second fills main<...>; every %d is a ch/sp dimension
+        @"program(%s)\n"
+        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n"
+        "{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+        "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+        "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+        "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+        "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
+        "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
+        "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
+        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+        "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+        "[name=string(\"conv\")];\n"
+        "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
+        "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
+        "    } -> (y);\n"
+        "}\n", g_ane_platform.mil_program, ane_mil_target(), ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];  // argument order must track the specifiers above
+}
+
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        mach_timebase_info(&g_tb);
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+        ane_detect_platform();
+        ane_print_platform();
+
+        Class g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+        Class g_I  = NSClassFromString(@"_ANEInMemoryModel");
+        Class g_AR = NSClassFromString(@"_ANERequest");
+        Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+
+        if (!g_D || !g_I || !g_AR || !g_AIO) {
+            printf("FAIL: ANE classes not found\n");
+            return 1;
+        }
+
+        // Use 64-channel conv, spatial=32 (known to work on ANE)
+        int CH = 64, SP = 32;
+
+        // Weight set A: scaled identity (1.0 on diagonal)
+        _Float16 *weightsA = (_Float16*)calloc(CH*CH, sizeof(_Float16));
+        for (int i = 0; i < CH; i++) weightsA[i*CH+i] = (_Float16)1.0f;
+
+        // Weight set B: 3x identity
+        _Float16 *weightsB = (_Float16*)calloc(CH*CH, sizeof(_Float16));
+        for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f;
+
+        NSData *wdataA = build_weight_blob(weightsA, CH, CH);
+        NSString *mil = gen_mil(CH, SP);
+        NSDictionary *weights = @{
+            @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
+        };
+        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+
+        // === Compile with weights A ===
+        printf("=== Step 1: Compile with weights A (identity) ===\n");
+        printf("  Kernel: %dx%d conv, spatial=%d\n", CH, CH, SP);
+        uint64_t t0 = mach_absolute_time();
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil);
+        if (!desc) { printf("FAIL: desc=NULL\n"); return 1; }
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+
+        NSError *e = nil;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; }
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; }
+        double compile_ms = tb_ms(mach_absolute_time() - t0);
+        printf("  Compile+load: %.1fms\n", compile_ms);
+        printf("  tmpDir: %s\n", [td UTF8String]);
+
+        // Build request and IOSurfaces (fp32 I/O)
+        int inBytes = CH * SP * 4;  // fp32
+        int outBytes = CH * SP * 4;
+        IOSurfaceRef ioIn = make_surface(inBytes);
+        IOSurfaceRef ioOut = make_surface(outBytes);
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+        // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01
+        IOSurfaceLock(ioIn, 0, NULL);
+        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+        for (int c = 0; c < CH; c++)
+            for (int s = 0; s < SP; s++)
+                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        IOSurfaceUnlock(ioIn, 0, NULL);
+
+        // Eval with weights A
+        printf("\n=== Step 2: Eval with weights A ===\n");
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
+
+        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
+        printf("  Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]);
+        printf("  Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
+               outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]);
+        // Save copy
+        float *outA_copy = (float*)malloc(outBytes);
+        memcpy(outA_copy, outA, outBytes);
+        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+
+        // === Step 3: Overwrite weight file with B, unload+load ===
+        printf("\n=== Step 3: Overwrite weight.bin with B (3x identity), unload+load ===\n");
+        NSData *wdataB = build_weight_blob(weightsB, CH, CH);
+        NSString *weightPath = [td stringByAppendingPathComponent:@"weights/weight.bin"];
+        [wdataB writeToFile:weightPath atomically:YES];
+        printf("  Wrote new weight.bin\n");
+
+        // Unload
+        t0 = mach_absolute_time();
+        ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        double unload_ms = tb_ms(mach_absolute_time() - t0);
+        printf("  Unload: %s (%.2fms)\n", ok ? "OK" : "FAIL", unload_ms);
+
+        // Reload (no compile!)
+        t0 = mach_absolute_time();
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        double reload_ms = tb_ms(mach_absolute_time() - t0);
+        printf("  Load (no recompile): %s (%.2fms)\n", ok ? "OK" : [[e description] UTF8String], reload_ms);
+
+        if (!ok) {
+            printf("\n*** Load-after-overwrite FAILED — trying recompile+load ***\n");
+            t0 = mach_absolute_time();
+            ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+            printf("  Re-compile: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
+            t0 = mach_absolute_time();
+            ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+            printf("  Re-load: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0));
+        }
+
+        // Build new request (re-use same surfaces)
+        wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
+        wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut);
+        req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+        // Re-write same input
+        IOSurfaceLock(ioIn, 0, NULL);
+        inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+        for (int c = 0; c < CH; c++)
+            for (int s = 0; s < SP; s++)
+                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        IOSurfaceUnlock(ioIn, 0, NULL);
+
+        // Eval with (possibly reloaded) weights B
+        printf("\n=== Step 4: Eval after reload ===\n");
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }
+
+        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
+        printf("  Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]);
+        printf("  Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
+               outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]);
+
+        // Check: did the output change?
+        bool changed = false;
+        float max_diff = 0;
+        for (int i = 0; i < CH*SP; i++) {
+            float d = fabsf(outB[i] - outA_copy[i]);
+            if (d > max_diff) max_diff = d;
+            if (d > 0.001f) changed = true;
+        }
+        // Expected: output B should be 3x output A
+        bool correct_3x = true;
+        float max_3x_err = 0;
+        for (int i = 0; i < CH*SP; i++) {
+            float expected = outA_copy[i] * 3.0f;
+            float err = fabsf(outB[i] - expected);
+            if (err > max_3x_err) max_3x_err = err;
+            if (err > 0.1f) correct_3x = false;
+        }
+        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+
+        printf("\n=== RESULT ===\n");
+        printf("  Max A-B diff: %.6f\n", max_diff);
+        printf("  Max 3x error: %.6f\n", max_3x_err);
+        printf("  Compile+load: %.1fms | Unload: %.1fms | Reload: %.1fms\n", compile_ms, unload_ms, reload_ms);
+
+        if (changed && correct_3x) {
+            printf("\nSUCCESS: Weight reload works! Output matches 3x identity.\n");
+            printf("  Speedup: compile=%.1fms vs reload=%.1fms (%.1fx faster)\n",
+                   compile_ms, unload_ms + reload_ms, compile_ms / (unload_ms + reload_ms));
+            printf(">>> Compilation bottleneck can be eliminated <<<\n");
+        } else if (changed && !correct_3x) {
+            printf("\nPARTIAL: Output changed but doesn't match expected 3x.\n");
+        } else {
+            printf("\nFAIL: Output did NOT change. Weight reload does not work.\n");
+            printf("  ANE cached the compiled model — weights baked at compile time.\n");
+            printf(">>> Need alternative: weightsBuffer IOSurface or async recompile <<<\n");
+        }
+
+        // Cleanup
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [fm removeItemAtPath:td error:nil];
+        CFRelease(ioIn); CFRelease(ioOut);
+        free(outA_copy); free(weightsA); free(weightsB);
+    }
+    return 0;
+}
diff --git a/training/tiny_train.m b/training/tiny_train.m
index e1e9d7d..ba90951 100644
--- a/training/tiny_train.m
+++ b/training/tiny_train.m
@@ -1,593 +1,597 @@
-// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
-// y = W2 @ relu(W1 @ x), MSE loss, SGD update
-// Pipeline: compile next kernels on background thread while ANE runs current batch
-// Bypasses ANE 119-compile limit via exec() self-restart
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#include <math.h>
-#include <unistd.h>
-#include <dispatch/dispatch.h>
-
-static Class g_D, g_I, g_AR, g_AIO;
-
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-static NSData *build_blob(const float *w, int rows, int cols) {
-    int wsize = rows * cols * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf+72) = wsize;
-    *(uint32_t*)(buf+80) = 128;
-    _Float16 *fp16 = (_Float16*)(buf + 128);
-    for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i];
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-static NSData *build_blob_transposed(const float *w, int rows, int cols) {
-    int wsize = cols * rows * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf+72) = wsize;
-    *(uint32_t*)(buf+80) = 128;
-    _Float16 *fp16 = (_Float16*)(buf + 128);
-    for (int i = 0; i < rows; i++)
-        for (int j = 0; j < cols; j++)
-            fp16[j * rows + i] = (_Float16)w[i * cols + j];
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
-    return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
-        "    } -> (y);\n}\n",
-        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
-}
-
-typedef struct {
-    void *model;    // CFBridgingRetain'd _ANEInMemoryModel
-    IOSurfaceRef ioIn, ioOut;
-    void *request;  // CFBridgingRetain'd _ANERequest
-    void *tmpDir;   // CFBridgingRetain'd NSString
-} Kern;
-
-static int g_compile_count = 0;
-
-static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
-    @autoreleasepool {
-    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
-    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
-    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
-    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
-    if (!desc) return NULL;
-    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-    NSFileManager *fm = [NSFileManager defaultManager];
-    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
-    __sync_fetch_and_add(&g_compile_count, 1);
-    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
-    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
-    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
-    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
-    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-    Kern *k = calloc(1, sizeof(Kern));
-    k->model = CFBridgingRetain(mdl);
-    k->ioIn = ioI; k->ioOut = ioO;
-    k->request = CFBridgingRetain(req);
-    k->tmpDir = CFBridgingRetain(td);
-    return k;
-    }
-}
-
-static void free_kern(Kern *k) {
-    if (!k) return;
-    id mdl = (__bridge id)k->model;
-    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-    CFRelease(k->ioIn); CFRelease(k->ioOut);
-    NSString *td = (__bridge id)k->tmpDir;
-    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
-    CFRelease(k->model);
-    CFRelease(k->request);
-    CFRelease(k->tmpDir);
-    free(k);
-}
-
-static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
-    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < in_ch; c++)
-            tmp[c*sp + t] = in[t*in_ch + c];
-    IOSurfaceLock(k->ioIn, 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
-    IOSurfaceUnlock(k->ioIn, 0, NULL);
-    free(tmp);
-    NSError *e = nil;
-    id mdl = (__bridge id)k->model;
-    id req = (__bridge id)k->request;
-    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
-    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
-    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < out_ch; c++)
-            out[t*out_ch + c] = tmp2[c*sp + t];
-    free(tmp2);
-}
-
-// === Checkpoint: save/restore training state for exec() restart ===
-#define CKPT_PATH "/tmp/ane_train_ckpt.bin"
-
-typedef struct {
-    int step;
-    float loss;
-    int D, H, S, total_steps;
-    float lr;
-    double cum_compile_ms, cum_train_ms, cum_wall_ms;
-    int cum_steps, cum_batches;
-} CkptHeader;
-
-static void save_checkpoint(const char *path, int step, float loss,
-                            int D, int H, int S, int total_steps, float lr,
-                            const float *W1, const float *W2,
-                            double cc, double ct, double cw, int cs, int cb) {
-    FILE *f = fopen(path, "wb");
-    CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
-    fwrite(&hdr, sizeof(hdr), 1, f);
-    fwrite(W1, sizeof(float), H * D, f);
-    fwrite(W2, sizeof(float), D * H, f);
-    fclose(f);
-}
-
-static bool load_checkpoint(const char *path, CkptHeader *hdr,
-                            float *W1, float *W2, int H, int D) {
-    FILE *f = fopen(path, "rb");
-    if (!f) return false;
-    fread(hdr, sizeof(CkptHeader), 1, f);
-    fread(W1, sizeof(float), H * D, f);
-    fread(W2, sizeof(float), D * H, f);
-    fclose(f);
-    return true;
-}
-
-#define MAX_COMPILES 100
-#define KERNELS_PER_STEP 4
-#define ACCUM_STEPS 10
-
-// === Pipeline: background compile via GCD ===
-typedef struct {
-    Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd;
-    float *W1, *W2;
-    int D, H, S;
-    bool ok;
-    double compile_ms;
-} PipelineCompile;
-
-static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) {
-    return (double)elapsed * tb.numer / tb.denom / 1e6;
-}
-
-static mach_timebase_info_data_t g_tb;
-// Serial queue ensures ANE compiles don't overlap with each other
-static dispatch_queue_t g_compile_queue;
-
-int main(int argc, char *argv[]) {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        ane_init();
-        mach_timebase_info(&g_tb);
-        g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL);
-
-        int D = 64, H = 128, S = 16;
-        int total_steps = 2000;
-        float lr = 1.0f;
-        int start_step = 0;
-        bool resuming = false;
-
-        float *W1 = (float*)malloc(H * D * sizeof(float));
-        float *W2 = (float*)malloc(D * H * sizeof(float));
-
-        if (argc > 1 && strcmp(argv[1], "--resume") == 0) {
-            CkptHeader hdr;
-            if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) {
-                start_step = hdr.step;
-                total_steps = hdr.total_steps;
-                lr = hdr.lr;
-                resuming = true;
-                printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
-            }
-        }
-
-        // Cumulative stats (restored from checkpoint if resuming)
-        double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0;
-        int cum_steps = 0, cum_batches = 0;
-        if (resuming) {
-            CkptHeader hdr2;
-            FILE *f = fopen(CKPT_PATH, "rb");
-            if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f);
-                cum_compile_ms = hdr2.cum_compile_ms;
-                cum_train_ms = hdr2.cum_train_ms;
-                cum_wall_ms = hdr2.cum_wall_ms;
-                cum_steps = hdr2.cum_steps;
-                cum_batches = hdr2.cum_batches;
-            }
-        }
-
-        // FLOPs calculation
-        // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S
-        // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S
-        // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S
-        // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE)
-        // CPU FLOPs per step = 4*D*H*S (dW accumulation)
-        // Total FLOPs per step = 12*D*H*S
-        double ane_flops_per_step = 8.0 * D * H * S;
-        double cpu_flops_per_step = 4.0 * D * H * S;
-        double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step;
-        double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE
-
-        if (!resuming) {
-            for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
-            for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
-            printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n");
-            printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D);
-            printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS);
-            printf("ANE FP16 peak: 15.8 TFLOPS (M4) | Weights: %.1f KB\n\n", weight_bytes/1024.0);
-            printf("FLOPs/step: ANE=%.0f (fwd+bwd)  CPU=%.0f (dW)  Total=%.0f\n",
-                   ane_flops_per_step, cpu_flops_per_step, total_flops_per_step);
-            printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n",
-                   total_steps, lr, MAX_COMPILES);
-        }
-
-        float *x = (float*)calloc(S * D, sizeof(float));
-        float *y_target = (float*)calloc(S * D, sizeof(float));
-        for (int t = 0; t < S; t++)
-            for (int i = 0; i < D; i++) {
-                float v = sinf((t * D + i) * 0.1f);
-                x[t*D + i] = v;
-                y_target[t*D + i] = v;
-            }
-
-        float *h = (float*)malloc(S * H * sizeof(float));
-        float *h_relu = (float*)malloc(S * H * sizeof(float));
-        float *y = (float*)malloc(S * D * sizeof(float));
-        float *dy = (float*)malloc(S * D * sizeof(float));
-        float *dh_relu = (float*)malloc(S * H * sizeof(float));
-        float *dh = (float*)malloc(S * H * sizeof(float));
-        float *dx_layer = (float*)malloc(S * D * sizeof(float));
-
-        Kern *k1_fwd = NULL, *k2_fwd = NULL;
-        Kern *k1_bwd = NULL, *k2_bwd = NULL;
-        float last_loss = 999.0f;
-
-        // Stats
-        double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0;
-        double total_hidden_compile_ms = 0; // compile time hidden by pipeline
-        int total_batches = 0;
-        int total_steps_done = 0;
-        uint64_t t_wall_start = mach_absolute_time();
-
-        // First compile is synchronous (no pipeline yet)
-        {
-            uint64_t t0 = mach_absolute_time();
-            k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
-            k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
-            k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
-            k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
-            double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
-            total_compile_ms += cms;
-            if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
-                printf("Initial compile failed!\n"); return 1;
-            }
-            printf("Initial compile: %.0fms\n", cms);
-        }
-
-        int step = start_step;
-        while (step < total_steps) {
-            // Check compile budget
-            if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) {
-                free_kern(k1_fwd); free_kern(k2_fwd);
-                free_kern(k1_bwd); free_kern(k2_bwd);
-                save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
-                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
-                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
-                                    cum_steps + total_steps_done, cum_batches + total_batches);
-                double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
-                printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n",
-                       step, g_compile_count, last_loss, wall);
-                fflush(stdout);
-                execl(argv[0], argv[0], "--resume", NULL);
-                perror("execl failed"); return 1;
-            }
-
-            // === Run ACCUM_STEPS with current kernels ===
-            float *aW1 = (float*)calloc(H * D, sizeof(float));
-            float *aW2 = (float*)calloc(D * H, sizeof(float));
-            int steps_this_batch = 0;
-
-            // Pipeline: start compiling NEXT batch's kernels in background
-            // We'll apply gradients first, then launch compile with updated W
-            // But for pipeline, we compile AHEAD: while running batch N, compile for N+1
-            // So we need to update weights BEFORE launching background compile
-
-            uint64_t t_batch = mach_absolute_time();
-            for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
-                ane_eval_k(k1_fwd, x, h, D, H, S);
-                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-                ane_eval_k(k2_fwd, h_relu, y, H, D, S);
-
-                float loss = 0;
-                for (int i = 0; i < S*D; i++) {
-                    float diff = y[i] - y_target[i];
-                    loss += diff * diff;
-                    dy[i] = 2.0f * diff / (S * D);
-                }
-                loss /= (S * D);
-                last_loss = loss;
-
-                ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
-                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
-                ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
-
-                for (int t = 0; t < S; t++)
-                    for (int i = 0; i < D; i++)
-                        for (int j = 0; j < H; j++)
-                            aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
-                for (int t = 0; t < S; t++)
-                    for (int i = 0; i < H; i++)
-                        for (int j = 0; j < D; j++)
-                            aW1[i*D + j] += dh[t*H + i] * x[t*D + j];
-
-                steps_this_batch++;
-            }
-            double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb);
-            total_train_ms += batch_ms;
-
-            // Apply accumulated gradients
-            float scale = 1.0f / steps_this_batch;
-            for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale;
-            for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale;
-            free(aW1); free(aW2);
-
-            total_steps_done += steps_this_batch;
-            total_batches++;
-
-            // Print progress
-            double step_ms = batch_ms / steps_this_batch;
-            double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
-            double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
-
-            if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) {
-                printf("step %-5d loss=%-10.6f  %5.1fms/step  ANE=%.2f GFLOPS  total=%.2f GFLOPS  compiles=%d\n",
-                       step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count);
-            }
-
-            // Pipeline: launch background compile with updated weights,
-            // then immediately start NEXT batch's ANE evals with OLD kernels
-            // while compile runs concurrently on GCD queue
-            bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES);
-
-            if (can_pipeline) {
-                // Snapshot weights for background compile
-                PipelineCompile *pc = calloc(1, sizeof(PipelineCompile));
-                pc->W1 = (float*)malloc(H * D * sizeof(float));
-                pc->W2 = (float*)malloc(D * H * sizeof(float));
-                memcpy(pc->W1, W1, H * D * sizeof(float));
-                memcpy(pc->W2, W2, D * H * sizeof(float));
-                pc->D = D; pc->H = H; pc->S = S;
-
-                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
-
-                dispatch_async(g_compile_queue, ^{
-                    @autoreleasepool {
-                        uint64_t t0 = mach_absolute_time();
-                        pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S);
-                        pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S);
-                        pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S);
-                        pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S);
-                        pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb);
-                        pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd;
-                        dispatch_semaphore_signal(sem);
-                    }
-                });
-
-                // === While compile runs in background, do ANOTHER batch with OLD kernels ===
-                if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) {
-                    float *aW1b = (float*)calloc(H * D, sizeof(float));
-                    float *aW2b = (float*)calloc(D * H, sizeof(float));
-                    int steps_overlap = 0;
-                    uint64_t t_overlap = mach_absolute_time();
-
-                    for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
-                        ane_eval_k(k1_fwd, x, h, D, H, S);
-                        for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-                        ane_eval_k(k2_fwd, h_relu, y, H, D, S);
-
-                        float loss = 0;
-                        for (int i = 0; i < S*D; i++) {
-                            float diff = y[i] - y_target[i];
-                            loss += diff * diff;
-                            dy[i] = 2.0f * diff / (S * D);
-                        }
-                        loss /= (S * D);
-                        last_loss = loss;
-
-                        ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
-                        for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
-                        ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
-
-                        for (int t = 0; t < S; t++)
-                            for (int i = 0; i < D; i++)
-                                for (int j = 0; j < H; j++)
-                                    aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
-                        for (int t = 0; t < S; t++)
-                            for (int i = 0; i < H; i++)
-                                for (int j = 0; j < D; j++)
-                                    aW1b[i*D + j] += dh[t*H + i] * x[t*D + j];
-                        steps_overlap++;
-                    }
-                    double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb);
-                    total_train_ms += overlap_ms;
-                    total_steps_done += steps_overlap;
-                    total_batches++;
-
-                    // Apply these gradients with reduced LR (stale weights — 1 batch behind)
-                    float sc = 0.5f / steps_overlap; // half LR for stale batch
-                    for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc;
-                    for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc;
-                    free(aW1b); free(aW2b);
-
-                    if (total_batches % 5 == 1) {
-                        double sm = overlap_ms / steps_overlap;
-                        printf("step %-5d loss=%-10.6f  %5.1fms/step  (overlapped with compile)  compiles=%d\n",
-                               step - steps_overlap, last_loss, sm, g_compile_count);
-                    }
-                }
-
-                // Wait for compile to finish
-                dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
-                total_compile_ms += pc->compile_ms;
-                total_hidden_compile_ms += pc->compile_ms; // all hidden behind train
-
-                free_kern(k1_fwd); free_kern(k2_fwd);
-                free_kern(k1_bwd); free_kern(k2_bwd);
-
-                if (pc->ok) {
-                    k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd;
-                    k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd;
-                } else {
-                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
-                }
-                free(pc->W1); free(pc->W2); free(pc);
-            } else if (step < total_steps) {
-                // Synchronous compile (no budget for pipeline)
-                uint64_t t0 = mach_absolute_time();
-                free_kern(k1_fwd); free_kern(k2_fwd);
-                free_kern(k1_bwd); free_kern(k2_bwd);
-                k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
-                k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
-                k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
-                k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
-                double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
-                total_compile_ms += cms;
-                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
-                    save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
-                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
-                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
-                                    cum_steps + total_steps_done, cum_batches + total_batches);
-                    fflush(stdout);
-                    execl(argv[0], argv[0], "--resume", NULL);
-                    perror("execl failed"); return 1;
-                }
-            }
-
-            if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
-        }
-
-        total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
-        // Add cumulative from previous exec() runs
-        total_compile_ms += cum_compile_ms;
-        total_train_ms += cum_train_ms;
-        total_wall_ms += cum_wall_ms;
-        total_steps_done += cum_steps;
-        total_batches += cum_batches;
-
-        // === Final output ===
-        printf("\nFinal output vs target (first 8):\n");
-        if (k1_fwd && k2_fwd) {
-            ane_eval_k(k1_fwd, x, h, D, H, S);
-            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-            ane_eval_k(k2_fwd, h_relu, y, H, D, S);
-        }
-        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
-        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
-
-        // === Efficiency Report ===
-        printf("\n=== Efficiency Report ===\n");
-        printf("Total steps:     %d\n", total_steps_done);
-        printf("Total batches:   %d (accum %d steps each)\n", total_batches, ACCUM_STEPS);
-        printf("Wall time:       %.0f ms\n", total_wall_ms);
-        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms);
-        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms);
-        printf("Overhead:        %.0f ms (%.1f%%)\n",
-               total_wall_ms - total_compile_ms - total_train_ms,
-               100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms);
-        printf("\n");
-        printf("Avg compile:     %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches);
-        printf("Avg train:       %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done);
-        printf("Avg wall/step:   %.2f ms\n", total_wall_ms / total_steps_done);
-        printf("\n");
-        double ane_total_flops = ane_flops_per_step * total_steps_done;
-        double cpu_total_flops = cpu_flops_per_step * total_steps_done;
-        printf("ANE FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
-               ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6));
-        printf("CPU FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
-               cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6));
-        printf("Total FLOPs:     %.3f MFLOP  (%.2f GFLOPS sustained)\n",
-               (ane_total_flops + cpu_total_flops) / 1e6,
-               (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6));
-        printf("\n");
-        printf("ANE utilization: %.4f%% of 15.8 TFLOPS peak\n",
-               100.0 * ane_total_flops / (total_train_ms * 1e6) / 15800.0);
-        printf("Weight params:   %d (%.1f KB FP16)\n",
-               H*D + D*H, weight_bytes / 1024.0);
-        printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n",
-               total_compile_ms / total_batches, ACCUM_STEPS,
-               total_compile_ms / total_batches / ACCUM_STEPS);
-        printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms);
-        printf("Train fraction:   %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms);
-
-        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
-        free(W1); free(W2); free(x); free(y_target);
-        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer);
-        unlink(CKPT_PATH);
-    }
-    return 0;
-}
+// tiny_train.m — Train a 2-layer MLP (two linear layers with a ReLU between them) on ANE (forward AND backward)
+// y = W2 @ relu(W1 @ x), MSE loss, SGD update
+// Pipeline: compile next kernels on background thread while ANE runs current batch
+// Bypasses ANE 119-compile limit via exec() self-restart
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include <unistd.h>
+#include "ane_compat.h"
+#include <dispatch/dispatch.h>
+
+static Class g_D, g_I, g_AR, g_AIO;  // private ANE classes, looked up by name in ane_init()
+
+static void ane_init(void) {  // Loads the AppleNeuralEngine private framework and caches the four classes used by this file.
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);  // result is not checked
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");  // NSClassFromString returns nil when the class is not loaded
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+
+static IOSurfaceRef make_surface(size_t bytes) {  // Creates a one-row IOSurface whose width, row stride and allocation size are all `bytes`; callers in this file CFRelease it.
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+static NSData *build_blob(const float *w, int rows, int cols) {  // Packs rows*cols floats as FP16 behind a 128-byte header; the returned NSData owns the buffer.
+    const int count = rows * cols;
+    const int payloadBytes = count * 2, blobBytes = 128 + payloadBytes;
+    uint8_t *blob = (uint8_t*)calloc(blobBytes, 1);  // zero-filled, so header bytes not set below stay 0
+    blob[0] = 0x01; blob[4] = 0x02;
+    static const uint8_t kMagic[4] = {0xEF, 0xBE, 0xAD, 0xDE};
+    memcpy(blob + 64, kMagic, sizeof(kMagic)); blob[68] = 0x01;
+    const uint32_t sizeField = (uint32_t)payloadBytes, offsetField = 128;  // stored in host byte order, like the original pointer stores
+    memcpy(blob + 72, &sizeField, sizeof(sizeField)); memcpy(blob + 80, &offsetField, sizeof(offsetField));
+    _Float16 *half = (_Float16*)(blob + 128);
+    for (int idx = 0; idx < count; ++idx) half[idx] = (_Float16)w[idx];
+    return [NSData dataWithBytesNoCopy:blob length:blobBytes freeWhenDone:YES];
+}
+
+static NSData *build_blob_transposed(const float *w, int rows, int cols) {  // Same blob layout as build_blob, but stores the transpose of the rows x cols matrix w.
+    const int payloadBytes = cols * rows * 2;
+    const int blobBytes = 128 + payloadBytes;
+    uint8_t *blob = (uint8_t*)calloc(blobBytes, 1);  // zero-filled, so header bytes not set below stay 0
+    blob[0] = 0x01; blob[4] = 0x02;
+    static const uint8_t kMagic[4] = {0xEF, 0xBE, 0xAD, 0xDE};
+    memcpy(blob + 64, kMagic, sizeof(kMagic)); blob[68] = 0x01;
+    const uint32_t sizeField = (uint32_t)payloadBytes, offsetField = 128;  // stored in host byte order, like the original pointer stores
+    memcpy(blob + 72, &sizeField, sizeof(sizeField)); memcpy(blob + 80, &offsetField, sizeof(offsetField));
+    _Float16 *half = (_Float16*)(blob + 128);
+    for (int c = 0; c < cols; c++)        // walk the destination in its own row-major order
+        for (int r = 0; r < rows; r++)
+            half[c * rows + r] = (_Float16)w[r * cols + c];
+    return [NSData dataWithBytesNoCopy:blob length:blobBytes freeWhenDone:YES];
+}
+
+static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {  // Returns MIL text for one 1x1 conv: fp32 x[1,in_ch,1,sp] -> fp16 -> conv with W[out_ch,in_ch,1,1] from the weight blob -> fp32 y[1,out_ch,1,sp].
+    return [NSString stringWithFormat:
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "    } -> (y);\n}\n",
+        g_ane_platform.mil_program, ane_mil_target(),  // program version and func target come from ane_compat.h
+        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];  // %d order: x, x16, W type, W value, y16, y
+}
+
+typedef struct {  // One compiled and loaded kernel plus its I/O surfaces; built by compile_kern_with_blob(), destroyed by free_kern()
+    void *model;    // CFBridgingRetain'd _ANEInMemoryModel
+    IOSurfaceRef ioIn, ioOut;  // input/output buffers, sized in_ch*sp*4 and out_ch*sp*4 bytes at creation
+    void *request;  // CFBridgingRetain'd _ANERequest
+    void *tmpDir;   // CFBridgingRetain'd NSString
+} Kern;
+
+static int g_compile_count = 0;  // number of successful compile+load calls in this process; main() compares it to MAX_COMPILES
+
+static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {  // Compiles and loads one conv kernel for `blob`; returns a Kern the caller frees with free_kern(), or NULL on failure.
+    @autoreleasepool {
+    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
+    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
+    if (!desc) return NULL;
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));  // a nil mdl yields nil here
+    if (!hx) return NULL;  // -stringByAppendingPathComponent: raises on a nil argument, so bail out first
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+    NSError *e = nil;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e) ||
+        !((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { [fm removeItemAtPath:td error:nil]; return NULL; }  // do not leave the staging dir behind
+    __sync_fetch_and_add(&g_compile_count, 1);  // atomic: this function also runs on the background compile queue
+    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
+    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+    Kern *k = calloc(1, sizeof(Kern));
+    k->model = CFBridgingRetain(mdl); k->request = CFBridgingRetain(req);  // retained past the autorelease pool; free_kern() releases them
+    k->ioIn = ioI; k->ioOut = ioO;
+    k->tmpDir = CFBridgingRetain(td);
+    return k;
+    }
+}
+
+static void free_kern(Kern *k) {  // Unloads the model, releases the surfaces and retained objects, deletes the staging dir and frees k; a NULL k is a no-op.
+    if (!k) return;
+    id mdl = (__bridge id)k->model;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);  // result and error are ignored
+    CFRelease(k->ioIn); CFRelease(k->ioOut);
+    NSString *td = (__bridge id)k->tmpDir;
+    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+    CFRelease(k->model);  // these three balance the CFBridgingRetain calls in compile_kern_with_blob()
+    CFRelease(k->request);
+    CFRelease(k->tmpDir);
+    free(k);
+}
+
+static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {  // Runs kernel k on `in` ([sp][in_ch] floats) and writes the result to `out` ([sp][out_ch] floats).
+    // The surfaces hold the data channel-major ([ch][sp]), so each copy is also a transpose.
+    // Transposing directly in the locked surface memory removes the two unchecked malloc'd temporaries.
+    IOSurfaceLock(k->ioIn, 0, NULL);
+    float *dst = (float*)IOSurfaceGetBaseAddress(k->ioIn);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < in_ch; c++)
+            dst[c*sp + t] = in[t*in_ch + c];
+    IOSurfaceUnlock(k->ioIn, 0, NULL);
+    NSError *e = nil;
+    id mdl = (__bridge id)k->model;
+    id req = (__bridge id)k->request;
+    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+    // The signature is void, so a failed evaluation is reported on stderr; `out` still receives the surface contents as before.
+    if (!ok) fprintf(stderr, "ane_eval_k: evaluate failed: %s\n", e ? [[e description] UTF8String] : "unknown error");
+    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    const float *src = (const float*)IOSurfaceGetBaseAddress(k->ioOut);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < out_ch; c++)
+            out[t*out_ch + c] = src[c*sp + t];
+    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+}
+
+// === Checkpoint: save/restore training state for exec() restart ===
+#define CKPT_PATH "/tmp/ane_train_ckpt.bin"  // fixed location used by both the process that saves and the re-exec'd process that resumes
+
+typedef struct {  // Header written first by save_checkpoint(); the file then holds W1 followed by W2 as raw floats
+    int step;  // step index at save time; main() resumes from it
+    float loss;  // last loss value at save time
+    int D, H, S, total_steps;  // model dimensions and planned number of steps
+    float lr;  // learning rate
+    double cum_compile_ms, cum_train_ms, cum_wall_ms;  // timing totals carried across exec() restarts
+    int cum_steps, cum_batches;  // step and batch totals carried across exec() restarts
+} CkptHeader;
+
+static void save_checkpoint(const char *path, int step, float loss,  // Writes header + W1 + W2 to `path`; on any I/O error it reports to stderr and removes the partial file.
+                            int D, int H, int S, int total_steps, float lr,
+                            const float *W1, const float *W2,
+                            double cc, double ct, double cw, int cs, int cb) {
+    FILE *f = fopen(path, "wb");
+    if (!f) { perror("save_checkpoint: fopen"); return; }  // previously a NULL stream was handed to fwrite
+    CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
+    const size_t n = (size_t)H * (size_t)D;  // W1 holds H*D floats and W2 holds D*H floats: the same count
+    bool ok = fwrite(&hdr, sizeof(hdr), 1, f) == 1 && fwrite(W1, sizeof(float), n, f) == n && fwrite(W2, sizeof(float), n, f) == n;
+    if (fclose(f) != 0 || !ok) { fprintf(stderr, "save_checkpoint: write to %s failed\n", path); unlink(path); }  // fclose is evaluated first so the stream is always closed; a truncated file must not be resumed from
+}
+
+static bool load_checkpoint(const char *path, CkptHeader *hdr,  // Reads header + W1 + W2 from `path`; returns true only if everything was read and the stored H/D match the arguments.
+                            float *W1, float *W2, int H, int D) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    const size_t n = (size_t)H * (size_t)D;  // element count of each weight buffer supplied by the caller
+    bool ok = fread(hdr, sizeof(CkptHeader), 1, f) == 1 && hdr->H == H && hdr->D == D  // reject a file written for other dimensions
+           && fread(W1, sizeof(float), n, f) == n && fread(W2, sizeof(float), n, f) == n;  // a short read means a truncated checkpoint
+    fclose(f);
+    return ok;  // on false, *hdr, W1 and W2 may be partially overwritten and must not be used
+}
+
+#define MAX_COMPILES 100  // per-process compile budget: main() checkpoints and re-execs itself before g_compile_count would exceed it
+#define KERNELS_PER_STEP 4  // kernels compiled per weight refresh (k1/k2 forward and backward)
+#define ACCUM_STEPS 10  // training steps whose gradients are accumulated between recompiles
+
+// === Pipeline: background compile via GCD ===
+typedef struct {  // State handed from main() to one background compile block and read back after it signals completion
+    Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd;  // results: the four kernels built by the block (NULL where a compile failed)
+    float *W1, *W2;  // inputs: heap copies of the weights taken when the compile was launched; freed by main()
+    int D, H, S;  // inputs: dimensions used to build the kernels
+    bool ok;  // true when all four kernels were produced
+    double compile_ms;  // time the block spent compiling all four kernels
+} PipelineCompile;
+
+static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) {  // Converts a mach_absolute_time() tick delta to milliseconds using timebase tb.
+    return (((double)elapsed * (double)tb.numer) / (double)tb.denom) / 1e6;  // ticks -> nanoseconds -> milliseconds
+}
+
+static mach_timebase_info_data_t g_tb;  // filled by mach_timebase_info() at the start of main(); passed to tb_to_ms()
+// Serial queue ensures ANE compiles don't overlap with each other
+static dispatch_queue_t g_compile_queue;  // created in main() with DISPATCH_QUEUE_SERIAL
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+        mach_timebase_info(&g_tb);
+        g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL);
+
+        int D = 64, H = 128, S = 16;
+        int total_steps = 2000;
+        float lr = 1.0f;
+        int start_step = 0;
+        bool resuming = false;
+
+        float *W1 = (float*)malloc(H * D * sizeof(float));
+        float *W2 = (float*)malloc(D * H * sizeof(float));
+
+        if (argc > 1 && strcmp(argv[1], "--resume") == 0) {
+            CkptHeader hdr;
+            if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) {
+                start_step = hdr.step;
+                total_steps = hdr.total_steps;
+                lr = hdr.lr;
+                resuming = true;
+                printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
+            }
+        }
+
+        // Cumulative stats (restored from checkpoint if resuming)
+        double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0;
+        int cum_steps = 0, cum_batches = 0;
+        if (resuming) {
+            CkptHeader hdr2;
+            FILE *f = fopen(CKPT_PATH, "rb");
+            if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f);
+                cum_compile_ms = hdr2.cum_compile_ms;
+                cum_train_ms = hdr2.cum_train_ms;
+                cum_wall_ms = hdr2.cum_wall_ms;
+                cum_steps = hdr2.cum_steps;
+                cum_batches = hdr2.cum_batches;
+            }
+        }
+
+        // FLOPs calculation
+        // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S
+        // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S
+        // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S
+        // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE)
+        // CPU FLOPs per step = 4*D*H*S (dW accumulation)
+        // Total FLOPs per step = 12*D*H*S
+        double ane_flops_per_step = 8.0 * D * H * S;
+        double cpu_flops_per_step = 4.0 * D * H * S;
+        double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step;
+        double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE
+
+        if (!resuming) {
+            for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
+            for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
+            printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n");
+            printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D);
+            printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS);
+            printf("ANE FP16 peak: %.1f TFLOPS (%s) | Weights: %.1f KB\n\n", ane_peak_tflops(), g_ane_platform.chip_name, weight_bytes/1024.0);
+            printf("FLOPs/step: ANE=%.0f (fwd+bwd)  CPU=%.0f (dW)  Total=%.0f\n",
+                   ane_flops_per_step, cpu_flops_per_step, total_flops_per_step);
+            printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n",
+                   total_steps, lr, MAX_COMPILES);
+        }
+
+        float *x = (float*)calloc(S * D, sizeof(float));
+        float *y_target = (float*)calloc(S * D, sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < D; i++) {
+                float v = sinf((t * D + i) * 0.1f);
+                x[t*D + i] = v;
+                y_target[t*D + i] = v;
+            }
+
+        float *h = (float*)malloc(S * H * sizeof(float));
+        float *h_relu = (float*)malloc(S * H * sizeof(float));
+        float *y = (float*)malloc(S * D * sizeof(float));
+        float *dy = (float*)malloc(S * D * sizeof(float));
+        float *dh_relu = (float*)malloc(S * H * sizeof(float));
+        float *dh = (float*)malloc(S * H * sizeof(float));
+        float *dx_layer = (float*)malloc(S * D * sizeof(float));
+
+        Kern *k1_fwd = NULL, *k2_fwd = NULL;
+        Kern *k1_bwd = NULL, *k2_bwd = NULL;
+        float last_loss = 999.0f;
+
+        // Stats
+        double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0;
+        double total_hidden_compile_ms = 0; // compile time hidden by pipeline
+        int total_batches = 0;
+        int total_steps_done = 0;
+        uint64_t t_wall_start = mach_absolute_time();
+
+        // First compile is synchronous (no pipeline yet)
+        {
+            uint64_t t0 = mach_absolute_time();
+            k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+            k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+            k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+            k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+            double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+            total_compile_ms += cms;
+            if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                printf("Initial compile failed!\n"); return 1;
+            }
+            printf("Initial compile: %.0fms\n", cms);
+        }
+
+        int step = start_step;
+        while (step < total_steps) {
+            // Check compile budget
+            if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) {
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
+                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
+                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
+                                    cum_steps + total_steps_done, cum_batches + total_batches);
+                double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
+                printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n",
+                       step, g_compile_count, last_loss, wall);
+                fflush(stdout);
+                execl(argv[0], argv[0], "--resume", NULL);
+                perror("execl failed"); return 1;
+            }
+
+            // === Run ACCUM_STEPS with current kernels ===
+            float *aW1 = (float*)calloc(H * D, sizeof(float));
+            float *aW2 = (float*)calloc(D * H, sizeof(float));
+            int steps_this_batch = 0;
+
+            // Pipeline: start compiling NEXT batch's kernels in background
+            // We'll apply gradients first, then launch compile with updated W
+            // But for pipeline, we compile AHEAD: while running batch N, compile for N+1
+            // So we need to update weights BEFORE launching background compile
+
+            uint64_t t_batch = mach_absolute_time();
+            for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
+                ane_eval_k(k1_fwd, x, h, D, H, S);
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+
+                float loss = 0;
+                for (int i = 0; i < S*D; i++) {
+                    float diff = y[i] - y_target[i];
+                    loss += diff * diff;
+                    dy[i] = 2.0f * diff / (S * D);
+                }
+                loss /= (S * D);
+                last_loss = loss;
+
+                ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
+
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < D; i++)
+                        for (int j = 0; j < H; j++)
+                            aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < H; i++)
+                        for (int j = 0; j < D; j++)
+                            aW1[i*D + j] += dh[t*H + i] * x[t*D + j];
+
+                steps_this_batch++;
+            }
+            double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb);
+            total_train_ms += batch_ms;
+
+            // Apply accumulated gradients
+            float scale = 1.0f / steps_this_batch;
+            for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale;
+            for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale;
+            free(aW1); free(aW2);
+
+            total_steps_done += steps_this_batch;
+            total_batches++;
+
+            // Print progress
+            double step_ms = batch_ms / steps_this_batch;
+            double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
+            double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6);
+
+            if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) {
+                printf("step %-5d loss=%-10.6f  %5.1fms/step  ANE=%.2f GFLOPS  total=%.2f GFLOPS  compiles=%d\n",
+                       step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count);
+            }
+
+            // Pipeline: launch background compile with updated weights,
+            // then immediately start NEXT batch's ANE evals with OLD kernels
+            // while compile runs concurrently on GCD queue
+            bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES);
+
+            if (can_pipeline) {
+                // Snapshot weights for background compile
+                PipelineCompile *pc = calloc(1, sizeof(PipelineCompile));
+                pc->W1 = (float*)malloc(H * D * sizeof(float));
+                pc->W2 = (float*)malloc(D * H * sizeof(float));
+                memcpy(pc->W1, W1, H * D * sizeof(float));
+                memcpy(pc->W2, W2, D * H * sizeof(float));
+                pc->D = D; pc->H = H; pc->S = S;
+
+                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
+
+                dispatch_async(g_compile_queue, ^{
+                    @autoreleasepool {
+                        uint64_t t0 = mach_absolute_time();
+                        pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S);
+                        pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S);
+                        pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S);
+                        pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S);
+                        pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+                        pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd;
+                        dispatch_semaphore_signal(sem);
+                    }
+                });
+
+                // === While compile runs in background, do ANOTHER batch with OLD kernels ===
+                if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) {
+                    float *aW1b = (float*)calloc(H * D, sizeof(float));
+                    float *aW2b = (float*)calloc(D * H, sizeof(float));
+                    int steps_overlap = 0;
+                    uint64_t t_overlap = mach_absolute_time();
+
+                    for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) {
+                        ane_eval_k(k1_fwd, x, h, D, H, S);
+                        for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                        ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+
+                        float loss = 0;
+                        for (int i = 0; i < S*D; i++) {
+                            float diff = y[i] - y_target[i];
+                            loss += diff * diff;
+                            dy[i] = 2.0f * diff / (S * D);
+                        }
+                        loss /= (S * D);
+                        last_loss = loss;
+
+                        ane_eval_k(k2_bwd, dy, dh_relu, D, H, S);
+                        for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                        ane_eval_k(k1_bwd, dh, dx_layer, H, D, S);
+
+                        for (int t = 0; t < S; t++)
+                            for (int i = 0; i < D; i++)
+                                for (int j = 0; j < H; j++)
+                                    aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+                        for (int t = 0; t < S; t++)
+                            for (int i = 0; i < H; i++)
+                                for (int j = 0; j < D; j++)
+                                    aW1b[i*D + j] += dh[t*H + i] * x[t*D + j];
+                        steps_overlap++;
+                    }
+                    double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb);
+                    total_train_ms += overlap_ms;
+                    total_steps_done += steps_overlap;
+                    total_batches++;
+
+                    // Apply these gradients with reduced LR (stale weights — 1 batch behind)
+                    float sc = 0.5f / steps_overlap; // half LR for stale batch
+                    for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc;
+                    for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc;
+                    free(aW1b); free(aW2b);
+
+                    if (total_batches % 5 == 1) {
+                        double sm = overlap_ms / steps_overlap;
+                        printf("step %-5d loss=%-10.6f  %5.1fms/step  (overlapped with compile)  compiles=%d\n",
+                               step - steps_overlap, last_loss, sm, g_compile_count);
+                    }
+                }
+
+                // Wait for compile to finish
+                dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
+                total_compile_ms += pc->compile_ms;
+                total_hidden_compile_ms += pc->compile_ms; // all hidden behind train
+
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+
+                if (pc->ok) {
+                    k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd;
+                    k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd;
+                } else {
+                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                }
+                free(pc->W1); free(pc->W2); free(pc);
+            } else if (step < total_steps) {
+                // Synchronous compile (no budget for pipeline)
+                uint64_t t0 = mach_absolute_time();
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+                k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+                k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+                k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+                double cms = tb_to_ms(mach_absolute_time() - t0, g_tb);
+                total_compile_ms += cms;
+                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                    save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2,
+                                    cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms,
+                                    cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb),
+                                    cum_steps + total_steps_done, cum_batches + total_batches);
+                    fflush(stdout);
+                    execl(argv[0], argv[0], "--resume", NULL);
+                    perror("execl failed"); return 1;
+                }
+            }
+
+            if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
+        }
+
+        total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb);
+        // Add cumulative from previous exec() runs
+        total_compile_ms += cum_compile_ms;
+        total_train_ms += cum_train_ms;
+        total_wall_ms += cum_wall_ms;
+        total_steps_done += cum_steps;
+        total_batches += cum_batches;
+
+        // === Final output ===
+        printf("\nFinal output vs target (first 8):\n");
+        if (k1_fwd && k2_fwd) {
+            ane_eval_k(k1_fwd, x, h, D, H, S);
+            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+            ane_eval_k(k2_fwd, h_relu, y, H, D, S);
+        }
+        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
+        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
+
+        // === Efficiency Report ===
+        printf("\n=== Efficiency Report ===\n");
+        printf("Total steps:     %d\n", total_steps_done);
+        printf("Total batches:   %d (accum %d steps each)\n", total_batches, ACCUM_STEPS);
+        printf("Wall time:       %.0f ms\n", total_wall_ms);
+        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms);
+        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms);
+        printf("Overhead:        %.0f ms (%.1f%%)\n",
+               total_wall_ms - total_compile_ms - total_train_ms,
+               100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms);
+        printf("\n");
+        printf("Avg compile:     %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches);
+        printf("Avg train:       %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done);
+        printf("Avg wall/step:   %.2f ms\n", total_wall_ms / total_steps_done);
+        printf("\n");
+        double ane_total_flops = ane_flops_per_step * total_steps_done;
+        double cpu_total_flops = cpu_flops_per_step * total_steps_done;
+        printf("ANE FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6));
+        printf("CPU FLOPs total: %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6));
+        printf("Total FLOPs:     %.3f MFLOP  (%.2f GFLOPS sustained)\n",
+               (ane_total_flops + cpu_total_flops) / 1e6,
+               (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6));
+        printf("\n");
+        printf("ANE utilization: %.4f%% of %.1f TFLOPS peak\n",
+               100.0 * ane_total_flops / (total_train_ms * 1e6) / (ane_peak_tflops() * 1000.0), ane_peak_tflops());
+        printf("Weight params:   %d (%.1f KB FP16)\n",
+               H*D + D*H, weight_bytes / 1024.0);
+        printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n",
+               total_compile_ms / total_batches, ACCUM_STEPS,
+               total_compile_ms / total_batches / ACCUM_STEPS);
+        printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms);
+        printf("Train fraction:   %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms);
+
+        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
+        free(W1); free(W2); free(x); free(y_target);
+        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer);
+        unlink(CKPT_PATH);
+    }
+    return 0;
+}
diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m
index c22a90c..54e9ce5 100644
--- a/training/tiny_train_old.m
+++ b/training/tiny_train_old.m
@@ -1,309 +1,313 @@
-// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
-// y = W2 @ relu(W1 @ x), MSE loss, SGD update
-// Forward: ANE conv with baked weights
-// Backward dx: ANE conv with transposed baked weights
-// Backward dW: CPU (outer product, memory-bound)
-#import <Foundation/Foundation.h>
-#import <objc/runtime.h>
-#import <objc/message.h>
-#import <dlfcn.h>
-#import <IOSurface/IOSurface.h>
-#import <mach/mach_time.h>
-#include <math.h>
-
-static Class g_D, g_I, g_AR, g_AIO;
-
-static void ane_init(void) {
-    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
-    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
-    g_I  = NSClassFromString(@"_ANEInMemoryModel");
-    g_AR = NSClassFromString(@"_ANERequest");
-    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
-}
-
-static IOSurfaceRef make_surface(size_t bytes) {
-    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
-        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
-        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
-        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
-}
-
-static NSData *build_blob(const float *w, int rows, int cols) {
-    int wsize = rows * cols * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf+72) = wsize;
-    *(uint32_t*)(buf+80) = 128;
-    _Float16 *fp16 = (_Float16*)(buf + 128);
-    for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i];
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows]
-static NSData *build_blob_transposed(const float *w, int rows, int cols) {
-    int wsize = cols * rows * 2;
-    int total = 128 + wsize;
-    uint8_t *buf = (uint8_t*)calloc(total, 1);
-    buf[0] = 0x01; buf[4] = 0x02;
-    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
-    buf[68] = 0x01;
-    *(uint32_t*)(buf+72) = wsize;
-    *(uint32_t*)(buf+80) = 128;
-    _Float16 *fp16 = (_Float16*)(buf + 128);
-    for (int i = 0; i < rows; i++)
-        for (int j = 0; j < cols; j++)
-            fp16[j * rows + i] = (_Float16)w[i * cols + j]; // transpose
-    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
-}
-
-static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
-    return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
-        "    } -> (y);\n}\n",
-        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
-}
-
-typedef struct {
-    id model;
-    IOSurfaceRef ioIn, ioOut;
-    id request;
-    NSString *tmpDir;
-} Kern;
-
-static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
-    NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
-    NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
-    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
-    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
-    if (!desc) return NULL;
-    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
-    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
-    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-    NSFileManager *fm = [NSFileManager defaultManager];
-    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
-    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
-    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
-    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
-    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
-    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
-    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
-    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
-    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
-        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
-        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
-    Kern *k = calloc(1, sizeof(Kern));
-    k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
-    return k;
-}
-
-static void free_kern(Kern *k) {
-    if (!k) return;
-    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
-    CFRelease(k->ioIn); CFRelease(k->ioOut);
-    [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
-    free(k);
-}
-
-// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first
-static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
-    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < in_ch; c++)
-            tmp[c*sp + t] = in[t*in_ch + c];
-    IOSurfaceLock(k->ioIn, 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
-    IOSurfaceUnlock(k->ioIn, 0, NULL);
-    free(tmp);
-    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
-    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
-    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
-    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < out_ch; c++)
-            out[t*out_ch + c] = tmp2[c*sp + t];
-    free(tmp2);
-}
-
-int main(int argc, char *argv[]) {
-    @autoreleasepool {
-        ane_init();
-        mach_timebase_info_data_t tb;
-        mach_timebase_info(&tb);
-
-        int D = 64, H = 128, S = 16;
-        int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit
-        float lr = 0.5f;
-        int recompile_every = 1; // recompile every step for correct gradients
-
-        float *W1 = (float*)malloc(H * D * sizeof(float));
-        float *W2 = (float*)malloc(D * H * sizeof(float));
-        for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
-        for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
-
-        float *x = (float*)calloc(S * D, sizeof(float));
-        float *y_target = (float*)calloc(S * D, sizeof(float));
-        for (int t = 0; t < S; t++)
-            for (int i = 0; i < D; i++) {
-                float v = sinf((t * D + i) * 0.1f);
-                x[t*D + i] = v;
-                y_target[t*D + i] = v;
-            }
-
-        printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n");
-        printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D);
-        printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n");
-        printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every);
-
-        float *h = (float*)malloc(S * H * sizeof(float));
-        float *h_relu = (float*)malloc(S * H * sizeof(float));
-        float *y = (float*)malloc(S * D * sizeof(float));
-        float *dy = (float*)malloc(S * D * sizeof(float));
-        float *dh_relu = (float*)malloc(S * H * sizeof(float));
-        float *dh = (float*)malloc(S * H * sizeof(float));
-        float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works
-        float *dW1 = (float*)calloc(H * D, sizeof(float));
-        float *dW2 = (float*)calloc(D * H, sizeof(float));
-
-        // 4 ANE kernels: 2 forward + 2 backward (transposed weights)
-        Kern *k1_fwd = NULL, *k2_fwd = NULL;  // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D)
-        Kern *k1_bwd = NULL, *k2_bwd = NULL;  // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H)
-        bool on_ane = true;
-
-        printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend");
-        printf("--------------------------------------\n");
-
-        for (int step = 0; step < steps; step++) {
-            uint64_t t0 = mach_absolute_time();
-
-            if (on_ane && step % recompile_every == 0) {
-                free_kern(k1_fwd); free_kern(k2_fwd);
-                free_kern(k1_bwd); free_kern(k2_bwd);
-                k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
-                @autoreleasepool {
-                    k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
-                    k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
-                    // Backward: dx = W^T @ dy → conv with transposed weight
-                    // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S]
-                    k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
-                    // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S]
-                    k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
-                }
-                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
-                    printf("ANE limit at step %d, continuing on CPU\n", step);
-                    free_kern(k1_fwd); free_kern(k2_fwd);
-                    free_kern(k1_bwd); free_kern(k2_bwd);
-                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
-                    on_ane = false;
-                }
-            }
-
-            if (on_ane) {
-                // === Forward on ANE ===
-                ane_eval(k1_fwd, x, h, D, H, S);
-                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-                ane_eval(k2_fwd, h_relu, y, H, D, S);
-            } else {
-                for (int t = 0; t < S; t++)
-                    for (int i = 0; i < H; i++) {
-                        float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j];
-                        h[t*H+i] = s;
-                    }
-                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-                for (int t = 0; t < S; t++)
-                    for (int i = 0; i < D; i++) {
-                        float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j];
-                        y[t*D+i] = s;
-                    }
-            }
-
-            // MSE loss + dL/dy
-            float loss = 0;
-            for (int i = 0; i < S*D; i++) {
-                float diff = y[i] - y_target[i];
-                loss += diff * diff;
-                dy[i] = 2.0f * diff / (S * D);
-            }
-            loss /= (S * D);
-
-            if (on_ane) {
-                // === Backward dx on ANE ===
-                // dh_relu = W2^T @ dy (ANE conv with transposed W2)
-                ane_eval(k2_bwd, dy, dh_relu, D, H, S);
-                // ReLU backward (CPU, element-wise)
-                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
-                // dx = W1^T @ dh (ANE conv with transposed W1)
-                ane_eval(k1_bwd, dh, dx_layer, H, D, S);
-            } else {
-                memset(dh_relu, 0, S * H * sizeof(float));
-                for (int t = 0; t < S; t++)
-                    for (int j = 0; j < H; j++)
-                        for (int i = 0; i < D; i++)
-                            dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i];
-                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
-            }
-
-            // dW on CPU (outer products — memory-bound, not worth ANE)
-            memset(dW2, 0, D * H * sizeof(float));
-            for (int t = 0; t < S; t++)
-                for (int i = 0; i < D; i++)
-                    for (int j = 0; j < H; j++)
-                        dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
-            memset(dW1, 0, H * D * sizeof(float));
-            for (int t = 0; t < S; t++)
-                for (int i = 0; i < H; i++)
-                    for (int j = 0; j < D; j++)
-                        dW1[i*D + j] += dh[t*H + i] * x[t*D + j];
-
-            // SGD
-            for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i];
-            for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i];
-
-            double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6;
-
-            if (step % 1 == 0 || step == steps - 1)
-                printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU");
-
-            if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
-        }
-
-        printf("\nFinal output vs target (first 8):\n");
-        if (on_ane && k1_fwd && k2_fwd) {
-            ane_eval(k1_fwd, x, h, D, H, S);
-            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
-            ane_eval(k2_fwd, h_relu, y, H, D, S);
-        }
-        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
-        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
-
-        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
-        free(W1); free(W2); free(x); free(y_target);
-        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2);
-        printf("\nDone.\n");
-    }
-    return 0;
-}
+// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
+// y = W2 @ relu(W1 @ x), MSE loss, SGD update
+// Forward: ANE conv with baked weights
+// Backward dx: ANE conv with transposed baked weights
+// Backward dW: CPU (outer product, memory-bound)
+#import <Foundation/Foundation.h>
+#import <objc/runtime.h>
+#import <objc/message.h>
+#import <dlfcn.h>
+#import <IOSurface/IOSurface.h>
+#import <mach/mach_time.h>
+#include <math.h>
+#include "ane_compat.h"
+
+static Class g_D, g_I, g_AR, g_AIO;
+
+static void ane_init(void) {  // Load the private ANE framework and cache the four classes this file messages.
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);  // return value is not checked
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");  // receiver of modelWithMILText:weights:optionsPlist:
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");            // receiver of inMemoryModelWithDescriptor:
+    g_AR = NSClassFromString(@"_ANERequest");                  // receiver of requestWithInputs:inputIndices:outputs:...
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");          // receiver of objectWithIOSurface:
+}
+
+static IOSurfaceRef make_surface(size_t bytes) {  // One-row surface, 1 byte per element, `bytes` wide and `bytes` allocated.
+    NSDictionary *props = @{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, (id)kIOSurfaceBytesPerElement:@1,
+        (id)kIOSurfaceBytesPerRow:@(bytes), (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0};
+    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
+}
+
+static NSData *build_blob(const float *w, int rows, int cols) {
+    // 128 zeroed header bytes (tags at 0, 4, 64..68; payload size at 72; value 128 at 80), then rows*cols fp16 values in input order.
+    const int count = rows * cols, payload = count * 2, length = 128 + payload;
+    uint8_t *bytes = (uint8_t*)calloc(length, 1);
+    bytes[0] = 0x01; bytes[4] = 0x02;
+    static const uint8_t tag[5] = {0xEF, 0xBE, 0xAD, 0xDE, 0x01};
+    memcpy(bytes + 64, tag, sizeof tag);
+    *(uint32_t*)(bytes+72) = payload;
+    *(uint32_t*)(bytes+80) = 128;
+    _Float16 *half = (_Float16*)(bytes + 128);
+    for (int idx = 0; idx < count; idx++) half[idx] = (_Float16)w[idx];
+    return [NSData dataWithBytesNoCopy:bytes length:length freeWhenDone:YES];
+}
+
+// Same 128-byte header as build_blob, but the payload is W^T: w is read as [rows, cols] and stored as [cols, rows].
+static NSData *build_blob_transposed(const float *w, int rows, int cols) {
+    const int payload = cols * rows * 2, length = 128 + payload;
+    uint8_t *bytes = (uint8_t*)calloc(length, 1);
+    bytes[0] = 0x01; bytes[4] = 0x02;
+    static const uint8_t tag[5] = {0xEF, 0xBE, 0xAD, 0xDE, 0x01};
+    memcpy(bytes + 64, tag, sizeof tag);
+    *(uint32_t*)(bytes+72) = payload;
+    *(uint32_t*)(bytes+80) = 128;
+    _Float16 *half = (_Float16*)(bytes + 128);
+    // Fill the destination in storage order: output row c is input column c.
+    for (int c = 0; c < cols; c++)
+        for (int r = 0; r < rows; r++)
+            half[c * rows + r] = (_Float16)w[r * cols + c];
+    return [NSData dataWithBytesNoCopy:bytes length:length freeWhenDone:YES];
+}
+
+static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {  // MIL text for one conv: fp32 [1,in_ch,1,sp] in -> fp32 [1,out_ch,1,sp] out
+    return [NSString stringWithFormat:
+        @"program(%s)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"\"}, "  // %s <- g_ane_platform.mil_program
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        "    func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n"  // %s <- ane_mil_target(); input shape [1,in_ch,1,sp]
+        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"  // input cast to fp16
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "  // weight shape [out_ch,in_ch,1,1]
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"  // read from weight.bin at offset 64
+        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "  // stride 1, zero pad, dilation 1, groups 1
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"  // result cast back to fp32
+        "    } -> (y);\n}\n",
+        g_ane_platform.mil_program, ane_mil_target(),  // both consumed by %s, so they must be C strings
+        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];  // x, x16, W type, W value type, y16, y (in placeholder order)
+}
+
+typedef struct {  // One compiled and loaded ANE kernel plus its I/O buffers; built by compile_kern_with_blob, destroyed by free_kern.
+    id model;                  // object returned by inMemoryModelWithDescriptor:; target of evaluate/unload
+    IOSurfaceRef ioIn, ioOut;  // input and output surfaces; CFRelease'd in free_kern
+    id request;                // request object wrapping ioIn and ioOut, passed to evaluateWithQoS:options:request:error:
+    NSString *tmpDir;          // directory holding model.mil and weights/weight.bin; removed in free_kern
+} Kern;
+
+// Compile and load a conv kernel (in_ch -> out_ch channels, sp positions) with `blob` as its weight file. Returns NULL on failure.
+static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
+    NSData *milData = [gen_conv_mil(in_ch, out_ch, sp) dataUsingEncoding:NSUTF8StringEncoding];
+    NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
+    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
+    if (!desc) return NULL;
+    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+    if (!hx) return NULL;  // nil model or identifier: appending nil as a path component below would raise
+    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+    NSFileManager *fm = [NSFileManager defaultManager];
+    [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+    NSError *e = nil;  // on compile/load failure td is deleted here, because no Kern will exist for free_kern to clean it up
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { [fm removeItemAtPath:td error:nil]; return NULL; }
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { [fm removeItemAtPath:td error:nil]; return NULL; }
+    IOSurfaceRef ioI = make_surface(in_ch * sp * 4), ioO = make_surface(out_ch * sp * 4);  // 4 bytes per fp32 element
+    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+    id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+    Kern *k = calloc(1, sizeof(Kern));
+    k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
+    return k;
+}
+
+static void free_kern(Kern *k) {  // Unload the model, release both surfaces, delete the temp dir and free k. NULL k is a no-op.
+    if (!k) return;
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);  // best effort: result ignored
+    if (k->ioIn) CFRelease(k->ioIn); if (k->ioOut) CFRelease(k->ioOut);  // guard: surface creation can yield NULL and CFRelease(NULL) crashes
+    [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
+    free(k);
+}
+
+// ANE eval: `in` is [sp, in_ch] row-major and `out` is [sp, out_ch] row-major; the surfaces hold the
+// channels-first layouts [in_ch, sp] and [out_ch, sp], so both copies transpose. Data moves directly through
+// the locked surfaces (no scratch buffers). A failed evaluation is reported on stderr; `out` is still filled.
+static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
+    IOSurfaceLock(k->ioIn, 0, NULL);
+    float *dst = (float*)IOSurfaceGetBaseAddress(k->ioIn);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < in_ch; c++)
+            dst[c*sp + t] = in[t*in_ch + c];
+    IOSurfaceUnlock(k->ioIn, 0, NULL);
+    NSError *e = nil;
+    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
+    if (!ok) fprintf(stderr, "ane_eval: evaluate failed: %s\n", e ? [[e description] UTF8String] : "(no error returned)");
+    // Copy the result back, transposing from channels-first to row-major.
+    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    const float *src = (const float*)IOSurfaceGetBaseAddress(k->ioOut);
+    for (int t = 0; t < sp; t++)
+        for (int c = 0; c < out_ch; c++)
+            out[t*out_ch + c] = src[c*sp + t];
+    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+}
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+        mach_timebase_info_data_t tb;
+        mach_timebase_info(&tb);
+
+        int D = 64, H = 128, S = 16;
+        int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit
+        float lr = 0.5f;
+        int recompile_every = 1; // recompile every step for correct gradients
+
+        float *W1 = (float*)malloc(H * D * sizeof(float));
+        float *W2 = (float*)malloc(D * H * sizeof(float));
+        for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f);
+        for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f);
+
+        float *x = (float*)calloc(S * D, sizeof(float));
+        float *y_target = (float*)calloc(S * D, sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < D; i++) {
+                float v = sinf((t * D + i) * 0.1f);
+                x[t*D + i] = v;
+                y_target[t*D + i] = v;
+            }
+
+        printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n");
+        printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D);
+        printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n");
+        printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every);
+
+        float *h = (float*)malloc(S * H * sizeof(float));
+        float *h_relu = (float*)malloc(S * H * sizeof(float));
+        float *y = (float*)malloc(S * D * sizeof(float));
+        float *dy = (float*)malloc(S * D * sizeof(float));
+        float *dh_relu = (float*)malloc(S * H * sizeof(float));
+        float *dh = (float*)malloc(S * H * sizeof(float));
+        float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works
+        float *dW1 = (float*)calloc(H * D, sizeof(float));
+        float *dW2 = (float*)calloc(D * H, sizeof(float));
+
+        // 4 ANE kernels: 2 forward + 2 backward (transposed weights)
+        Kern *k1_fwd = NULL, *k2_fwd = NULL;  // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D)
+        Kern *k1_bwd = NULL, *k2_bwd = NULL;  // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H)
+        bool on_ane = true;
+
+        printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend");
+        printf("--------------------------------------\n");
+
+        for (int step = 0; step < steps; step++) {
+            uint64_t t0 = mach_absolute_time();
+
+            if (on_ane && step % recompile_every == 0) {
+                free_kern(k1_fwd); free_kern(k2_fwd);
+                free_kern(k1_bwd); free_kern(k2_bwd);
+                k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                @autoreleasepool {
+                    k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S);
+                    k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S);
+                    // Backward: dx = W^T @ dy → conv with transposed weight
+                    // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S]
+                    k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S);
+                    // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S]
+                    k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S);
+                }
+                if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) {
+                    printf("ANE limit at step %d, continuing on CPU\n", step);
+                    free_kern(k1_fwd); free_kern(k2_fwd);
+                    free_kern(k1_bwd); free_kern(k2_bwd);
+                    k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL;
+                    on_ane = false;
+                }
+            }
+
+            if (on_ane) {
+                // === Forward on ANE ===
+                ane_eval(k1_fwd, x, h, D, H, S);
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                ane_eval(k2_fwd, h_relu, y, H, D, S);
+            } else {
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < H; i++) {
+                        float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j];
+                        h[t*H+i] = s;
+                    }
+                for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+                for (int t = 0; t < S; t++)
+                    for (int i = 0; i < D; i++) {
+                        float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j];
+                        y[t*D+i] = s;
+                    }
+            }
+
+            // MSE loss + dL/dy
+            float loss = 0;
+            for (int i = 0; i < S*D; i++) {
+                float diff = y[i] - y_target[i];
+                loss += diff * diff;
+                dy[i] = 2.0f * diff / (S * D);
+            }
+            loss /= (S * D);
+
+            if (on_ane) {
+                // === Backward dx on ANE ===
+                // dh_relu = W2^T @ dy (ANE conv with transposed W2)
+                ane_eval(k2_bwd, dy, dh_relu, D, H, S);
+                // ReLU backward (CPU, element-wise)
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+                // dx = W1^T @ dh (ANE conv with transposed W1)
+                ane_eval(k1_bwd, dh, dx_layer, H, D, S);
+            } else {
+                memset(dh_relu, 0, S * H * sizeof(float));
+                for (int t = 0; t < S; t++)
+                    for (int j = 0; j < H; j++)
+                        for (int i = 0; i < D; i++)
+                            dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i];
+                for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0;
+            }
+
+            // dW on CPU (outer products — memory-bound, not worth ANE)
+            memset(dW2, 0, D * H * sizeof(float));
+            for (int t = 0; t < S; t++)
+                for (int i = 0; i < D; i++)
+                    for (int j = 0; j < H; j++)
+                        dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j];
+            memset(dW1, 0, H * D * sizeof(float));
+            for (int t = 0; t < S; t++)
+                for (int i = 0; i < H; i++)
+                    for (int j = 0; j < D; j++)
+                        dW1[i*D + j] += dh[t*H + i] * x[t*D + j];
+
+            // SGD
+            for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i];
+            for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i];
+
+            double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6;
+
+            if (step % 1 == 0 || step == steps - 1)
+                printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU");
+
+            if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; }
+        }
+
+        printf("\nFinal output vs target (first 8):\n");
+        if (on_ane && k1_fwd && k2_fwd) {
+            ane_eval(k1_fwd, x, h, D, H, S);
+            for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0;
+            ane_eval(k2_fwd, h_relu, y, H, D, S);
+        }
+        printf("  y:      "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n");
+        printf("  target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n");
+
+        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
+        free(W1); free(W2); free(x); free(y_target);
+        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2);
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/train_large.m b/training/train_large.m
index e58ce08..c807352 100644
--- a/training/train_large.m
+++ b/training/train_large.m
@@ -1,687 +1,690 @@
-// train_large.m — Train stories110M (12 layers, 768dim, 3072hidden) on ANE
-// Uses pretokenized TinyStories data with cross-entropy loss
-// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch
-#include "stories_io.h"
-#include "stories_mil.h"
-#include "stories_cpu_ops.h"
-
-#define CKPT_PATH "ane_stories110M_ckpt.bin"
-#define MODEL_PATH "../../assets/models/stories110M.bin"
-#define DATA_PATH "tinystories_data00.bin"
-
-// ===== Weight loading from llama2.c format =====
-static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) {
-    FILE *f = fopen(path, "rb");
-    if (!f) { printf("Cannot open %s\n", path); return false; }
-    Llama2Config cfg;
-    fread(&cfg, sizeof(cfg), 1, f);
-    printf("  Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n",
-           cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len);
-    if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS) {
-        printf("  ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d\n", DIM, HIDDEN, NLAYERS);
-        fclose(f); return false;
-    }
-    int V = abs(cfg.vocab_size);
-    bool shared = cfg.vocab_size > 0;
-
-    // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all],
-    //                         rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls]
-    fread(embed, 4, V * DIM, f);
-
-    // rms_att weights for all layers (contiguous)
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f);
-    // wq for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f);
-    // wk for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f);
-    // wv for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f);
-    // wo for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f);
-    // rms_ffn weights for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f);
-    // w1 for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f);
-    // w2 for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f);
-    // w3 for all layers
-    for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f);
-    // rms_final
-    fread(rms_final, 4, DIM, f);
-    // wcls = embed if shared (we just use embed pointer)
-
-    fclose(f);
-    printf("  Loaded pretrained weights (%s)\n", shared ? "shared embed/cls" : "separate cls");
-    return true;
-}
-
-// ===== Compile one layer's kernels =====
-static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) {
-    lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
-        @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)},
-        @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)},
-        @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)},
-        @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)},
-        @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)},
-        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
-    }), DIM*SEQ*2, 6*DIM*SEQ*2);
-
-    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
-        @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)},
-        @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)},
-        @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)},
-        @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)},
-    }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
-
-    lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
-        @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)},
-        @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)},
-        @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)},
-    }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
-
-    lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
-        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
-        @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)},
-    }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
-
-    lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{
-        @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wq,DIM,DIM)},
-        @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)},
-        @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)},
-    }), 3*DIM*SEQ*2, DIM*SEQ*2);
-
-    return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd;
-}
-
-// Compile weight-free sdpaBwd2 (only needs once, no weights)
-static Kern *compile_sdpa_bwd2(void) {
-    return compile_kern_mil_w(gen_sdpa_bwd2(), @{},
-        (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2);
-}
-
-static void free_layer_kernels(LayerKernels *lk) {
-    free_kern(lk->fwdAttn); free_kern(lk->fwdFFN); free_kern(lk->ffnBwd);
-    free_kern(lk->sdpaBwd1); free_kern(lk->qkvBwd);
-    // sdpaBwd2 is shared, freed separately
-    lk->fwdAttn = lk->fwdFFN = lk->ffnBwd = lk->sdpaBwd1 = lk->qkvBwd = NULL;
-}
-
-// ===== Checkpoint save/load =====
-static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss,
-                            double cc, double ct, double cw, int cs, int cb, int adam_t,
-                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
-                            float *embed, AdamState *aembed) {
-    FILE *f = fopen(path, "wb");
-    CkptHdr h = {0};
-    h.magic = 0x424C5A54; h.version = 2;
-    h.step = step; h.total_steps = total_steps;
-    h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM;
-    h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ;
-    h.lr = lr; h.loss = loss;
-    h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw;
-    h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t;
-    fwrite(&h, sizeof(h), 1, f);
-    // Per-layer weights + adam
-    for (int L = 0; L < NLAYERS; L++) {
-        fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f);
-        fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f);
-        fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f);
-        fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f);
-        // Adam state
-        fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f);
-        fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f);
-        fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f);
-        fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f);
-        fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f);
-        fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f);
-        fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f);
-        fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f);
-        fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f);
-    }
-    fwrite(rms_final,4,DIM,f);
-    fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f);
-    fwrite(embed,4,VOCAB*DIM,f);
-    fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f);
-    fclose(f);
-}
-
-static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss,
-                             double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t,
-                             LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
-                             float *embed, AdamState *aembed) {
-    FILE *f = fopen(path, "rb");
-    if (!f) return false;
-    CkptHdr h;
-    fread(&h, sizeof(h), 1, f);
-    if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; }
-    *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss;
-    *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall;
-    *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t;
-    for (int L = 0; L < NLAYERS; L++) {
-        fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f);
-        fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f);
-        fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f);
-        fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f);
-        fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f);
-        fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f);
-        fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f);
-        fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f);
-        fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f);
-        fread(la[L].W2.m,4,W2_SZ,f); fread(la[L].W2.v,4,W2_SZ,f);
-        fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f);
-        fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f);
-        fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f);
-    }
-    fread(rms_final,4,DIM,f);
-    fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f);
-    fread(embed,4,VOCAB*DIM,f);
-    fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f);
-    fclose(f);
-    return true;
-}
-
-// ===== Main =====
-int main(int argc, char *argv[]) {
-    @autoreleasepool {
-        setbuf(stdout, NULL);
-        ane_init();
-        mach_timebase_info(&g_tb);
-
-        int total_steps = 10000;
-        float lr = 3e-4f;
-        float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f;
-        int adam_t = 0, start_step = 0;
-
-        // Parse args
-        bool do_resume = false;
-        for (int i=1; i<argc; i++) {
-            if (strcmp(argv[i], "--resume") == 0) do_resume = true;
-            else if (strcmp(argv[i], "--steps") == 0 && i+1<argc) total_steps = atoi(argv[++i]);
-            else if (strcmp(argv[i], "--lr") == 0 && i+1<argc) lr = atof(argv[++i]);
-        }
-
-        // Allocate per-layer state
-        LayerWeights lw[NLAYERS];
-        LayerAdam la[NLAYERS];
-        LayerActs acts[NLAYERS];
-        LayerGrads grads[NLAYERS];
-        LayerKernels kern[NLAYERS];
-        for (int L=0; L<NLAYERS; L++) {
-            lw[L] = layer_weights_alloc();
-            la[L] = layer_adam_alloc();
-            acts[L] = layer_acts_alloc();
-            grads[L] = layer_grads_alloc();
-            memset(&kern[L], 0, sizeof(LayerKernels));
-        }
-
-        // Final RMSNorm + embedding + classifier
-        float *rms_final = (float*)malloc(DIM*4);
-        float *embed = (float*)malloc(VOCAB*DIM*4);  // [VOCAB, DIM] row-major
-        float *grms_final = (float*)calloc(DIM, 4);
-        float *gembed = (float*)calloc(VOCAB*DIM, 4);
-        AdamState arms_final = adam_alloc(DIM);
-        AdamState aembed = adam_alloc((size_t)VOCAB*DIM);
-
-        double cum_compile=0, cum_train=0, cum_wall=0;
-        int cum_steps=0, cum_batches=0;
-
-        float resume_loss = 0;
-        bool resuming = false;
-        if (do_resume) {
-            resuming = load_checkpoint(CKPT_PATH, &start_step, &total_steps, &lr, &resume_loss,
-                &cum_compile, &cum_train, &cum_wall, &cum_steps, &cum_batches, &adam_t,
-                lw, la, rms_final, &arms_final, embed, &aembed);
-            if (resuming) printf("[RESUMED step %d, loss=%.4f]\n", start_step, resume_loss);
-        }
-        if (!resuming) {
-            printf("=== ANE Training: Stories110M (12 layers) ===\n");
-            printf("dim=%d hidden=%d heads=%d seq=%d vocab=%d layers=%d\n", DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS);
-            if (!load_pretrained(lw, rms_final, embed, MODEL_PATH)) {
-                printf("Pretrained load failed, using random init\n");
-                srand48(42);
-                float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
-                for (int L=0; L<NLAYERS; L++) {
-                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wq[i]=scale_d*(2*drand48()-1);lw[L].Wk[i]=scale_d*(2*drand48()-1);}
-                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wv[i]=scale_d*(2*drand48()-1);lw[L].Wo[i]=scale_d*(2*drand48()-1);}
-                    for(size_t i=0;i<W1_SZ;i++) lw[L].W1[i]=scale_h*(2*drand48()-1);
-                    for(size_t i=0;i<W2_SZ;i++) lw[L].W2[i]=scale_d*(2*drand48()-1);
-                    for(size_t i=0;i<W3_SZ;i++) lw[L].W3[i]=scale_h*(2*drand48()-1);
-                    for(int i=0;i<DIM;i++){lw[L].rms_att[i]=1.0f; lw[L].rms_ffn[i]=1.0f;}
-                }
-                for(int i=0;i<DIM;i++) rms_final[i]=1.0f;
-                float escale = 0.02f;
-                for(size_t i=0;i<(size_t)VOCAB*DIM;i++) embed[i]=escale*(2*drand48()-1);
-            }
-            size_t tp = (size_t)NLAYERS*LAYER_PARAMS + DIM + (size_t)VOCAB*DIM;
-            double xfmr_params = (double)NLAYERS*LAYER_PARAMS;
-            double embed_params = (double)VOCAB*DIM;
-            printf("Params: %.2fM (transformer %.2fM + embed %.2fM)\n", tp/1e6, xfmr_params/1e6, embed_params/1e6);
-            printf("Kernels: %d (%d weight-bearing + %d static sdpaBwd2)\n",
-                   TOTAL_WEIGHT_KERNELS+NLAYERS, TOTAL_WEIGHT_KERNELS, NLAYERS);
-            printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n", ACCUM_STEPS, lr, adam_b1, adam_b2);
-            double fwd_f = NLAYERS*(4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
-            double bwd_dx_f = fwd_f, bwd_dw_f = fwd_f;
-            double sdpa_f = NLAYERS*2.0*HEADS*5*SEQ*SEQ*HD;
-            double cls_f = 2.0*VOCAB*DIM*SEQ;
-            double total_f = fwd_f + bwd_dx_f + bwd_dw_f + sdpa_f + cls_f*3;
-            double ane_f = fwd_f + bwd_dx_f + sdpa_f;
-            printf("FLOPs/step: fwd=%.0fM bwd_dx=%.0fM bwd_dW=%.0fM sdpa_bwd=%.0fM total=%.0fM\n",
-                   fwd_f/1e6, bwd_dx_f/1e6, bwd_dw_f/1e6, sdpa_f/1e6, total_f/1e6);
-            printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)\n\n", ane_f/1e6);
-        }
-
-        // mmap token data
-        int data_fd = open(DATA_PATH, O_RDONLY);
-        if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
-        struct stat st; fstat(data_fd, &st);
-        size_t data_len = st.st_size;
-        uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
-        if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
-        size_t n_tokens = data_len / 2;
-        printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
-
-        // Gradient buffers shared across layers (reused each step)
-        float *dy = (float*)malloc(SEQ*DIM*4);            // gradient flowing backward
-        float *dffn = (float*)malloc(SEQ*DIM*4);
-        float *dh1 = (float*)malloc(SEQ*HIDDEN*4);
-        float *dh3 = (float*)malloc(SEQ*HIDDEN*4);
-        float *dx_ffn = (float*)malloc(SEQ*DIM*4);
-        float *dx2 = (float*)malloc(SEQ*DIM*4);
-        float *do_out_buf = (float*)malloc(SEQ*DIM*4);
-        float *dq = (float*)malloc(SEQ*DIM*4);
-        float *dk = (float*)malloc(SEQ*DIM*4);
-        float *dv = (float*)malloc(SEQ*DIM*4);
-        float *dx_attn = (float*)malloc(SEQ*DIM*4);
-
-        // x buffer for input to each layer (channel-first [DIM, SEQ])
-        float *x_cur = (float*)malloc(SEQ*DIM*4);
-        float *x_final = (float*)malloc(SEQ*DIM*4);     // after final rmsnorm
-        float *logits = (float*)malloc(SEQ*VOCAB*4);     // [VOCAB, SEQ] for cross-entropy
-        float *dlogits = (float*)malloc(SEQ*VOCAB*4);
-
-        // Compile static sdpaBwd2 kernels (no weights, one per layer)
-        Kern *sdpaBwd2[NLAYERS];
-        for (int L=0; L<NLAYERS; L++) {
-            sdpaBwd2[L] = compile_sdpa_bwd2();
-            if (!sdpaBwd2[L]) { printf("sdpaBwd2 compile failed\n"); return 1; }
-        }
-
-        dispatch_queue_t dw_q = dispatch_queue_create("dw_cblas", DISPATCH_QUEUE_SERIAL);
-        dispatch_group_t dw_grp = dispatch_group_create();
-
-        float last_loss = 999.0f;
-        double total_compile_ms=0, total_train_ms=0;
-        int total_steps_done=0, total_batches=0;
-        uint64_t t_wall_start = mach_absolute_time();
-
-        srand48(42 + start_step);
-
-        int step = start_step;
-        while (step < total_steps) {
-            // Check compile budget
-            if (g_compile_count + TOTAL_WEIGHT_KERNELS > MAX_COMPILES) {
-                for (int L=0; L<NLAYERS; L++) { free_layer_kernels(&kern[L]); free_kern(sdpaBwd2[L]); }
-                double wall = tb_ms(mach_absolute_time() - t_wall_start);
-                save_checkpoint(CKPT_PATH, step, total_steps, lr, last_loss,
-                    total_compile_ms+cum_compile, total_train_ms+cum_train, wall+cum_wall,
-                    total_steps_done+cum_steps, total_batches+cum_batches, adam_t,
-                    lw, la, rms_final, &arms_final, embed, &aembed);
-                printf("[exec() restart step %d, %d compiles, loss=%.4f]\n", step, g_compile_count, last_loss);
-                fflush(stdout);
-                execl(argv[0], argv[0], "--resume", NULL);
-                perror("execl"); return 1;
-            }
-
-            // Compile all layers' weight-bearing kernels
-            uint64_t tc = mach_absolute_time();
-            for (int L=0; L<NLAYERS; L++) free_layer_kernels(&kern[L]);
-
-            bool compile_ok = true;
-            for (int L=0; L<NLAYERS; L++) {
-                printf("  Compiling layer %d/%d... (%d compiles)\r", L+1, NLAYERS, g_compile_count);
-                fflush(stdout);
-                if (!compile_layer_kernels(&kern[L], &lw[L])) {
-                    printf("\nCompile failed at layer %d, restart\n", L);
-                    compile_ok = false; break;
-                }
-            }
-            if (!compile_ok) { g_compile_count = MAX_COMPILES; continue; }
-
-            // Re-compile sdpaBwd2 if needed (after exec restart)
-            for (int L=0; L<NLAYERS; L++) {
-                if (!sdpaBwd2[L]) {
-                    sdpaBwd2[L] = compile_sdpa_bwd2();
-                    if (!sdpaBwd2[L]) { printf("sdpaBwd2 recompile failed\n"); return 1; }
-                }
-            }
-
-            double cms = tb_ms(mach_absolute_time() - tc);
-            total_compile_ms += cms;
-            printf("  Compiled %d kernels in %.0fms                    \n", TOTAL_WEIGHT_KERNELS, cms);
-
-            // Zero gradient accumulators
-            for (int L=0; L<NLAYERS; L++) layer_grads_zero(&grads[L]);
-            memset(grms_final, 0, DIM*4);
-            memset(gembed, 0, (size_t)VOCAB*DIM*4);
-
-            int steps_batch = 0;
-            uint64_t tt = mach_absolute_time();
-            double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0,t_cls=0;
-
-            for (int a=0; a<ACCUM_STEPS && step<total_steps; a++, step++) {
-                uint64_t t0,t1;
-                // Sample random position in token data
-                size_t max_pos = n_tokens - SEQ - 1;
-                size_t pos = (size_t)(drand48() * max_pos);
-                uint16_t *input_tokens = token_data + pos;
-                uint16_t *target_tokens = token_data + pos + 1;
-
-                // Embedding lookup → x_cur [DIM, SEQ] channel-first
-                t0=mach_absolute_time();
-                embed_lookup(x_cur, embed, input_tokens, DIM, SEQ);
-                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
-
-                // ===== FORWARD (12 layers) =====
-                for (int L=0; L<NLAYERS; L++) {
-                    LayerActs *ac = &acts[L];
-
-                    // Save layer input for rmsnorm1 backward
-                    memcpy(ac->layer_in, x_cur, SEQ*DIM*4);
-                    // Attention forward: x_cur → o_out,Q,K,V,attn_out,xnorm
-                    t0=mach_absolute_time();
-                    dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
-                    t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
-                    io_write_fp16(kern[L].fwdAttn->ioIn, x_cur, DIM, SEQ);
-                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                    ane_eval(kern[L].fwdAttn);
-                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->o_out,    0,     DIM, SEQ);
-                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->attn_out, 4*DIM, DIM, SEQ);
-                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->xnorm,    5*DIM, DIM, SEQ);
-                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-
-                    vDSP_vadd(x_cur, 1, ac->o_out, 1, ac->x2, 1, (vDSP_Length)(SEQ*DIM));
-                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
-
-                    // FFN forward
-                    io_write_fp16(kern[L].fwdFFN->ioIn, ac->x2, DIM, SEQ);
-                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-                    ane_eval(kern[L].fwdFFN);
-                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
-                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->ffn_out,  0,              DIM,    SEQ);
-                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h1,       DIM,            HIDDEN, SEQ);
-                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h3,       DIM+HIDDEN,     HIDDEN, SEQ);
-                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->silu_out, DIM+2*HIDDEN,   HIDDEN, SEQ);
-                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->x2norm,   DIM+3*HIDDEN,   DIM,    SEQ);
-                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
-
-                    vDSP_vadd(ac->x2, 1, ac->ffn_out, 1, x_cur, 1, (vDSP_Length)(SEQ*DIM));
-                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
-                }
-
-                // Final RMSNorm (CPU)
-                t0=mach_absolute_time();
-                rmsnorm(x_final, x_cur, rms_final, DIM, SEQ);
-                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0); t0=t1;
-
-                // Classifier: logits = embed^T @ x_final
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                            VOCAB, SEQ, DIM, 1.0f,
-                            embed, DIM, x_final, SEQ, 0.0f, logits, SEQ);
-                t1=mach_absolute_time(); t_cls+=tb_ms(t1-t0); t0=t1;
-
-                // Cross-entropy loss
-                float loss = cross_entropy_loss(dlogits, logits, target_tokens, VOCAB, SEQ);
-                last_loss = loss;
-                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
-
-                // ===== BACKWARD =====
-                // dlogits already computed by cross_entropy_loss
-
-                // Classifier backward: dx_final = embed^T @ dlogits, dembed += dlogits @ x_final^T
-                // dx_final[DIM,SEQ] = embed^T[DIM,VOCAB] @ dlogits[VOCAB,SEQ]
-                cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                            DIM, SEQ, VOCAB, 1.0f,
-                            embed, DIM, dlogits, SEQ, 0.0f, dy, SEQ);
-
-                // dembed[VOCAB,DIM] += dlogits[VOCAB,SEQ] @ x_final^T[SEQ,DIM]
-                dispatch_group_async(dw_grp, dw_q, ^{
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                                VOCAB, DIM, SEQ, 1.0f,
-                                dlogits, SEQ, x_final, SEQ, 1.0f, gembed, DIM);
-                });
-
-                // Final RMSNorm backward
-                float *dx_rms_final = (float*)calloc(SEQ*DIM, 4);
-                rmsnorm_bwd(dx_rms_final, grms_final, dy, x_cur, rms_final, DIM, SEQ);
-                memcpy(dy, dx_rms_final, SEQ*DIM*4);
-                free(dx_rms_final);
-
-                // ===== BACKWARD (12 layers, reverse) =====
-                for (int L=NLAYERS-1; L>=0; L--) {
-                    LayerActs *ac = &acts[L];
-                    LayerGrads *gr = &grads[L];
-
-                    // dy is the gradient at the output of this layer
-                    // dffn = dy (residual connection: d(x2 + ffn) = dy for both)
-                    memcpy(dffn, dy, SEQ*DIM*4);
-
-                    // FFN backward (ANE)
-                    io_write_fp16_at(kern[L].ffnBwd->ioIn, 0, dffn, DIM, SEQ);
-                    io_copy(kern[L].ffnBwd->ioIn, DIM, kern[L].fwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
-                    ane_eval(kern[L].ffnBwd);
-                    io_read_fp16(kern[L].ffnBwd->ioOut, dx_ffn, 0,           DIM,    SEQ);
-                    io_read_fp16(kern[L].ffnBwd->ioOut, dh1,    DIM,         HIDDEN, SEQ);
-                    io_read_fp16(kern[L].ffnBwd->ioOut, dh3,    DIM+HIDDEN,  HIDDEN, SEQ);
-
-                    // dW FFN async
-                    float *capt_dffn = (float*)malloc(SEQ*DIM*4); memcpy(capt_dffn, dffn, SEQ*DIM*4);
-                    float *capt_silu = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_silu, ac->silu_out, SEQ*HIDDEN*4);
-                    float *capt_dh1 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh1, dh1, SEQ*HIDDEN*4);
-                    float *capt_dh3 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh3, dh3, SEQ*HIDDEN*4);
-                    float *capt_x2n = (float*)malloc(SEQ*DIM*4); memcpy(capt_x2n, ac->x2norm, SEQ*DIM*4);
-                    dispatch_group_async(dw_grp, dw_q, ^{
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
-                                    1.0f, capt_dffn, SEQ, capt_silu, SEQ, 1.0f, gr->W2, HIDDEN);
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
-                                    1.0f, capt_dh1, SEQ, capt_x2n, SEQ, 1.0f, gr->W1, DIM);
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
-                                    1.0f, capt_dh3, SEQ, capt_x2n, SEQ, 1.0f, gr->W3, DIM);
-                        free(capt_dffn); free(capt_silu); free(capt_dh1); free(capt_dh3); free(capt_x2n);
-                    });
-
-                    // RMSNorm2 backward
-                    memset(dx2, 0, SEQ*DIM*4);
-                    rmsnorm_bwd(dx2, gr->rms_ffn, dx_ffn, ac->x2, lw[L].rms_ffn, DIM, SEQ);
-                    // Add residual: dx2 += dy (from skip connection)
-                    for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
-
-                    // dWo async
-                    memcpy(do_out_buf, dx2, SEQ*DIM*4);
-                    float *capt_do = (float*)malloc(SEQ*DIM*4); memcpy(capt_do, do_out_buf, SEQ*DIM*4);
-                    float *capt_attn = (float*)malloc(SEQ*DIM*4); memcpy(capt_attn, ac->attn_out, SEQ*DIM*4);
-                    dispatch_group_async(dw_grp, dw_q, ^{
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                    1.0f, capt_do, SEQ, capt_attn, SEQ, 1.0f, gr->Wo, DIM);
-                        free(capt_do); free(capt_attn);
-                    });
-
-                    // SDPA backward (ANE)
-                    io_copy(kern[L].sdpaBwd1->ioIn, 0, kern[L].fwdAttn->ioOut, DIM, 3*DIM, SEQ);
-                    io_write_fp16_at(kern[L].sdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
-                    ane_eval(kern[L].sdpaBwd1);
-                    io_copy(sdpaBwd2[L]->ioIn, 0, kern[L].sdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
-                    io_copy(sdpaBwd2[L]->ioIn, 2*SCORE_CH, kern[L].fwdAttn->ioOut, DIM, 2*DIM, SEQ);
-                    ane_eval(sdpaBwd2[L]);
-
-                    io_read_fp16(sdpaBwd2[L]->ioOut, dq, 0,   DIM, SEQ);
-                    io_read_fp16(sdpaBwd2[L]->ioOut, dk, DIM,  DIM, SEQ);
-                    io_read_fp16(kern[L].sdpaBwd1->ioOut, dv, 0, DIM, SEQ);
-
-                    // dWq/dWk/dWv async
-                    float *capt_dq = (float*)malloc(SEQ*DIM*4); memcpy(capt_dq, dq, SEQ*DIM*4);
-                    float *capt_dk = (float*)malloc(SEQ*DIM*4); memcpy(capt_dk, dk, SEQ*DIM*4);
-                    float *capt_dv = (float*)malloc(SEQ*DIM*4); memcpy(capt_dv, dv, SEQ*DIM*4);
-                    float *capt_xn = (float*)malloc(SEQ*DIM*4); memcpy(capt_xn, ac->xnorm, SEQ*DIM*4);
-                    dispatch_group_async(dw_grp, dw_q, ^{
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                    1.0f, capt_dq, SEQ, capt_xn, SEQ, 1.0f, gr->Wq, DIM);
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                    1.0f, capt_dk, SEQ, capt_xn, SEQ, 1.0f, gr->Wk, DIM);
-                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
-                                    1.0f, capt_dv, SEQ, capt_xn, SEQ, 1.0f, gr->Wv, DIM);
-                        free(capt_dq); free(capt_dk); free(capt_dv); free(capt_xn);
-                    });
-
-                    // QKV backward (ANE)
-                    io_copy(kern[L].qkvBwd->ioIn, 0, sdpaBwd2[L]->ioOut, 0, 2*DIM, SEQ);
-                    io_copy(kern[L].qkvBwd->ioIn, 2*DIM, kern[L].sdpaBwd1->ioOut, 0, DIM, SEQ);
-                    ane_eval(kern[L].qkvBwd);
-                    io_read_fp16(kern[L].qkvBwd->ioOut, dx_attn, 0, DIM, SEQ);
-
-                    // RMSNorm1 backward (using saved layer input)
-                    float *dx_rms1 = (float*)calloc(SEQ*DIM, 4);
-                    rmsnorm_bwd(dx_rms1, gr->rms_att, dx_attn, ac->layer_in, lw[L].rms_att, DIM, SEQ);
-
-                    // dy for next layer (going backward) = dx_rms1 + dx2 residual
-                    // Actually: layer output = layer_input + o_out, and x2 = layer_input + o_out
-                    // So dx(layer_input) = dx_attn_rmsnorm + dx2 (residual from attn skip)
-                    // Wait, dx2 already includes the attn skip residual gradient.
-                    // dy = dx_rms1 (through rmsnorm1) is the gradient to the layer input
-                    // But there's also the skip connection: layer_input → x2 directly
-                    // So total gradient to layer_input = dx_rms1 + dx2_skip
-                    // dx2 was computed as rmsnorm2_bwd + dy(ffn_skip), which already flows to x2
-                    // x2 = layer_input + o_out, so d(layer_input) from x2 path = dx2
-                    // And d(layer_input) from attn path through rmsnorm1 = dx_rms1
-                    // Total: dy_prev = dx_rms1 (attn rmsnorm path)
-                    // Wait no - dx2 = d(loss)/d(x2), not d(loss)/d(layer_input)
-                    // d(layer_input) = d(loss)/d(x2) * d(x2)/d(layer_input) = dx2 (since x2 = input + o_out, d(x2)/d(input) = 1)
-                    // Plus the path through rmsnorm1: dx_rms1
-                    // Hmm but dx2 was already used as input to SDPA backward... let me reconsider.
-                    //
-                    // Actually the gradient flow is:
-                    //   dy → split to (dffn, dy_skip)  [dy_skip = dy due to residual]
-                    //   dffn → ffnBwd → dx_ffn
-                    //   dx_ffn → rmsnorm2_bwd → dx_rms2
-                    //   dx2 = dx_rms2 + dy  (skip connection from residual x2 → output)
-                    //   dx2 → sdpaBwd → dx_attn through Wo^T
-                    //   dx_attn → qkvBwd → dx_qkv
-                    //   dx_qkv → rmsnorm1_bwd → dx_rms1
-                    //   dy_prev_layer = dx_rms1 + dx2  (skip connection input → x2)
-                    //
-                    // So: dy for previous layer = dx_rms1 + dx2
-                    for(int i=0;i<SEQ*DIM;i++) dy[i] = dx_rms1[i] + dx2[i];
-                    free(dx_rms1);
-                }
-
-                // Embedding backward
-                dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
-                embed_backward(gembed, dy, input_tokens, DIM, SEQ);
-
-                steps_batch++;
-                if (step % 10 == 0 || step == start_step)
-                    printf("step %-4d loss=%.4f\n", step, loss);
-
-                // JSON telemetry to stderr
-                double step_ane = t_ane/steps_batch, step_io = t_io/steps_batch;
-                double step_cls = t_cls/steps_batch, step_elem = t_elem/steps_batch;
-                double step_rms = t_rms/steps_batch, step_cbw = t_cblas_wait/steps_batch;
-                fprintf(stderr, "{\"type\":\"step\",\"step\":%d,\"loss\":%.6f,"
-                    "\"t_ane\":%.3f,\"t_io\":%.3f,\"t_cls\":%.3f,"
-                    "\"t_elem\":%.3f,\"t_rms\":%.3f,\"t_cblas_wait\":%.3f,"
-                    "\"compiles\":%d}\n",
-                    step, loss, step_ane, step_io, step_cls, step_elem, step_rms, step_cbw, g_compile_count);
-            }
-            double tms = tb_ms(mach_absolute_time() - tt);
-            total_train_ms += tms;
-            total_steps_done += steps_batch;
-            total_batches++;
-
-            // Ensure all async dW finished
-            dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
-
-            // Adam update (scale gradients by 1/steps_batch)
-            float gsc = 1.0f / steps_batch;
-            adam_t++;
-            for (int L=0; L<NLAYERS; L++) {
-                LayerGrads *g = &grads[L];
-                for(size_t i=0;i<WQ_SZ;i++){g->Wq[i]*=gsc;g->Wk[i]*=gsc;g->Wv[i]*=gsc;g->Wo[i]*=gsc;}
-                for(size_t i=0;i<W1_SZ;i++) g->W1[i]*=gsc;
-                for(size_t i=0;i<W2_SZ;i++) g->W2[i]*=gsc;
-                for(size_t i=0;i<W3_SZ;i++) g->W3[i]*=gsc;
-                for(int i=0;i<DIM;i++){g->rms_att[i]*=gsc; g->rms_ffn[i]*=gsc;}
-
-                adam_update(lw[L].Wq, g->Wq, &la[L].Wq, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].Wk, g->Wk, &la[L].Wk, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].Wv, g->Wv, &la[L].Wv, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].Wo, g->Wo, &la[L].Wo, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].W1, g->W1, &la[L].W1, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].W2, g->W2, &la[L].W2, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].W3, g->W3, &la[L].W3, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].rms_att, g->rms_att, &la[L].rms_att, adam_t, lr, adam_b1, adam_b2, adam_eps);
-                adam_update(lw[L].rms_ffn, g->rms_ffn, &la[L].rms_ffn, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            }
-            for(int i=0;i<DIM;i++) grms_final[i]*=gsc;
-            adam_update(rms_final, grms_final, &arms_final, adam_t, lr, adam_b1, adam_b2, adam_eps);
-            // Scale and update embed
-            for(size_t i=0;i<(size_t)VOCAB*DIM;i++) gembed[i]*=gsc;
-            adam_update(embed, gembed, &aembed, adam_t, lr, adam_b1, adam_b2, adam_eps);
-
-            printf("  [batch %d: compile=%.0fms train=%.1fms (%.1fms/step) compiles=%d]\n",
-                   steps_batch, cms, tms, tms/steps_batch, g_compile_count);
-            printf("    ane=%.1f io=%.1f cls=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
-                   t_ane/steps_batch, t_io/steps_batch, t_cls/steps_batch, t_elem/steps_batch,
-                   t_rms/steps_batch, t_cblas_wait/steps_batch);
-
-            // JSON batch telemetry to stderr
-            {
-                double bf = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
-                double bs = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
-                double ane_f_batch = (bf*2 + bs) * steps_batch;
-                double ane_tflops = ane_f_batch / (tms * 1e9);
-                fprintf(stderr, "{\"type\":\"batch\",\"batch\":%d,\"compile_ms\":%.1f,"
-                    "\"train_ms\":%.1f,\"ms_per_step\":%.1f}\n",
-                    steps_batch, cms, tms, tms/steps_batch);
-                fprintf(stderr, "{\"type\":\"perf\",\"ane_tflops\":%.3f,\"ane_util_pct\":%.2f}\n",
-                    ane_tflops, 100.0*ane_tflops/15.8);
-            }
-        }
-
-        // Efficiency report
-        double wall = tb_ms(mach_absolute_time() - t_wall_start);
-        total_compile_ms += cum_compile; total_train_ms += cum_train;
-        wall += cum_wall; total_steps_done += cum_steps; total_batches += cum_batches;
-        double fwd_flops = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
-        double sdpa_flops = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
-        double cls_flops = 2.0*VOCAB*DIM*SEQ;
-        double total_flops = (fwd_flops*3 + sdpa_flops + cls_flops*3) * total_steps_done;
-        double ane_flops = (fwd_flops*2 + sdpa_flops) * total_steps_done;
-        printf("\n=== Efficiency Report ===\n");
-        printf("Total steps:     %d\n", total_steps_done);
-        printf("Wall time:       %.0f ms (%.1f s)\n", wall, wall/1000);
-        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100*total_compile_ms/wall);
-        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100*total_train_ms/wall);
-        printf("Avg train:       %.1f ms/step\n", total_train_ms/total_steps_done);
-        printf("ANE TFLOPS:      %.2f sustained\n", ane_flops / (total_train_ms * 1e9));
-        printf("Total TFLOPS:    %.2f (ANE+CPU)\n", total_flops / (total_train_ms * 1e9));
-        printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8);
-
-        // Cleanup
-        for (int L=0; L<NLAYERS; L++) {
-            free_layer_kernels(&kern[L]);
-            free_kern(sdpaBwd2[L]);
-            layer_weights_free(&lw[L]);
-            layer_adam_free(&la[L]);
-            layer_acts_free(&acts[L]);
-            layer_grads_free(&grads[L]);
-        }
-        munmap(token_data, data_len);
-        close(data_fd);
-        free(rms_final); free(embed); free(grms_final); free(gembed);
-        adam_free(&arms_final); adam_free(&aembed);
-        free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
-        free(do_out_buf); free(dq); free(dk); free(dv); free(dx_attn);
-        free(x_cur); free(x_final); free(logits); free(dlogits);
-    }
-    return 0;
-}
+// train_large.m — Train stories110M (12 layers, 768dim, 3072hidden) on ANE
+// Uses pretokenized TinyStories data with cross-entropy loss
+// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch
+#include "stories_io.h"
+#include "stories_mil.h"
+#include "stories_cpu_ops.h"
+#include "ane_compat.h"
+
+#define CKPT_PATH "ane_stories110M_ckpt.bin"
+#define MODEL_PATH "../../assets/models/stories110M.bin"
+#define DATA_PATH "tinystories_data00.bin"
+
+// ===== Weight loading from llama2.c format =====
+static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) {
+    // Load llama2.c-format weights from `path` into lw[0..NLAYERS), rms_final and embed.
+    // Returns false if the file is missing, truncated, or its dim/hidden/layers/vocab differ from this build.
+    FILE *f = fopen(path, "rb");
+    if (!f) { printf("Cannot open %s\n", path); return false; }
+    Llama2Config cfg;
+    if (fread(&cfg, sizeof(cfg), 1, f) != 1) { printf("  ERROR: cannot read config header from %s\n", path); fclose(f); return false; }
+    printf("  Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n",
+           cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len);
+    int V = abs(cfg.vocab_size);
+    bool shared = cfg.vocab_size > 0;
+    if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS || V != VOCAB) {  // V*DIM floats are read into embed
+        printf("  ERROR: Config mismatch! Expected dim=%d hidden=%d layers=%d vocab=%d\n", DIM, HIDDEN, NLAYERS, VOCAB);
+        fclose(f); return false;
+    }
+    // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all],
+    //                         rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls]
+    size_t got = fread(embed, 4, (size_t)V * DIM, f);  // got = running count of floats actually read
+    // rms_att weights for all layers (contiguous)
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].rms_att, 4, DIM, f);
+    // wq for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].Wq, 4, WQ_SZ, f);
+    // wk for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].Wk, 4, WQ_SZ, f);
+    // wv for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].Wv, 4, WQ_SZ, f);
+    // wo for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].Wo, 4, WO_SZ, f);
+    // rms_ffn weights for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].rms_ffn, 4, DIM, f);
+    // w1 for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].W1, 4, W1_SZ, f);
+    // w2 for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].W2, 4, W2_SZ, f);
+    // w3 for all layers
+    for (int L = 0; L < NLAYERS; L++) got += fread(lw[L].W3, 4, W3_SZ, f);
+    got += fread(rms_final, 4, DIM, f);  // rms_final
+    // wcls = embed if shared (we just use embed pointer)
+    fclose(f);
+    size_t want = (size_t)V*DIM + (size_t)NLAYERS*(3*WQ_SZ + WO_SZ + 2*DIM + W1_SZ + W2_SZ + W3_SZ) + DIM;
+    if (got != want) { printf("  ERROR: %s is truncated (read %zu of %zu floats)\n", path, got, want); return false; }
+    printf("  Loaded pretrained weights (%s)\n", shared ? "shared embed/cls" : "separate cls");
+    return true;
+}
+
+// ===== Compile one layer's kernels =====
+static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) {  // fills lk's five kernels from layer weights w
+    lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{
+        @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)},
+        @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)},
+        @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)},
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+    }), DIM*SEQ*2, 6*DIM*SEQ*2);
+    // FFN forward kernel: rms2 + W1/W3/W2 blobs (the two trailing args look like fp16 in/out byte sizes — TODO confirm)
+    lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{
+        @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)},
+        @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)},
+        @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)},
+    }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2);
+    // FFN backward kernel: W2/W1/W3 go through build_blob_t (presumably transposed copies — verify against its definition)
+    lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{
+        @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)},
+        @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)},
+        @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)},
+    }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2);
+    // First SDPA backward kernel: mask blob plus build_blob_t(Wo)
+    lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{
+        @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()},
+        @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)},
+    }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2);
+    // QKV backward kernel: build_blob_t of Wq/Wk/Wv
+    lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{
+        @"@model_path/weights/wqt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wq,DIM,DIM)},
+        @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)},
+        @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)},
+    }), 3*DIM*SEQ*2, DIM*SEQ*2);
+    // All five compiles are always attempted; report success only if every kernel pointer is non-NULL.
+    return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd;
+}
+
+// Compile the weight-free sdpaBwd2 kernel (empty weight dictionary); callers treat a NULL result as failure.
+static Kern *compile_sdpa_bwd2(void) {
+    Kern *bwd2 = compile_kern_mil_w(gen_sdpa_bwd2(), @{}, (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2);
+    return bwd2;
+}
+
+static void free_layer_kernels(LayerKernels *lk) {
+    // Release the five per-layer kernels and reset each slot to NULL.
+    // sdpaBwd2 kernels are not stored in LayerKernels; the caller frees them separately.
+    Kern **slots[] = { &lk->fwdAttn, &lk->fwdFFN, &lk->ffnBwd, &lk->sdpaBwd1, &lk->qkvBwd };
+    for (size_t i = 0; i < sizeof(slots)/sizeof(slots[0]); i++) { free_kern(*slots[i]); *slots[i] = NULL; }
+}
+
+// ===== Checkpoint save/load =====
+static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss,
+                            double cc, double ct, double cw, int cs, int cb, int adam_t,
+                            LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                            float *embed, AdamState *aembed) {
+    // Write a version-2 checkpoint: CkptHdr, then per layer the weights followed by their Adam m/v, then the
+    // final-norm and embedding tensors with their Adam m/v. load_checkpoint reads the same order back.
+    FILE *f = fopen(path, "wb");
+    if (!f) { printf("Cannot open %s for writing\n", path); return; }  // passing NULL to fwrite is undefined behaviour
+    CkptHdr h = {0}; h.magic = 0x424C5A54; h.version = 2;
+    h.step = step; h.total_steps = total_steps;
+    h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM;
+    h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ;
+    h.lr = lr; h.loss = loss; h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw;
+    h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t;
+    fwrite(&h, sizeof(h), 1, f);
+    // Per-layer weights + adam
+    for (int L = 0; L < NLAYERS; L++) {
+        fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f);
+        fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f);
+        fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f);
+        fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f);
+        // Adam state
+        fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f);
+        fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f);
+        fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f);
+        fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f);
+        fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f);
+        fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f);
+        fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f);
+        fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f);
+        fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f);
+    }
+    fwrite(rms_final,4,DIM,f); fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f);
+    fwrite(embed,4,(size_t)VOCAB*DIM,f); fwrite(aembed->m,4,(size_t)VOCAB*DIM,f); fwrite(aembed->v,4,(size_t)VOCAB*DIM,f);
+    int write_err = ferror(f);  // sticky error flag covers every fwrite above
+    if (fclose(f) != 0 || write_err) printf("WARNING: checkpoint %s may be incomplete (write error)\n", path);
+}
+
+static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss,
+                             double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t,
+                             LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final,
+                             float *embed, AdamState *aembed) {
+    // Restore a checkpoint written by save_checkpoint. Returns false if the file is missing, is not a version-2
+    // checkpoint for this NLAYERS/DIM/HIDDEN/VOCAB, or is truncated (tensor buffers may then be partially overwritten).
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    CkptHdr h; size_t got = 0;  // got = running count of floats actually read
+    if (fread(&h, sizeof(h), 1, f) != 1 || h.magic != 0x424C5A54 || h.version != 2 || h.n_layers != NLAYERS ||
+        h.dim != DIM || h.hidden_dim != HIDDEN || h.vocab_size != VOCAB) { fclose(f); return false; }
+    for (int L = 0; L < NLAYERS; L++) {
+        got += fread(lw[L].Wq,4,WQ_SZ,f); got += fread(lw[L].Wk,4,WQ_SZ,f);
+        got += fread(lw[L].Wv,4,WQ_SZ,f); got += fread(lw[L].Wo,4,WO_SZ,f);
+        got += fread(lw[L].W1,4,W1_SZ,f); got += fread(lw[L].W2,4,W2_SZ,f); got += fread(lw[L].W3,4,W3_SZ,f);
+        got += fread(lw[L].rms_att,4,DIM,f); got += fread(lw[L].rms_ffn,4,DIM,f);
+        got += fread(la[L].Wq.m,4,WQ_SZ,f); got += fread(la[L].Wq.v,4,WQ_SZ,f);
+        got += fread(la[L].Wk.m,4,WQ_SZ,f); got += fread(la[L].Wk.v,4,WQ_SZ,f);
+        got += fread(la[L].Wv.m,4,WQ_SZ,f); got += fread(la[L].Wv.v,4,WQ_SZ,f);
+        got += fread(la[L].Wo.m,4,WO_SZ,f); got += fread(la[L].Wo.v,4,WO_SZ,f);
+        got += fread(la[L].W1.m,4,W1_SZ,f); got += fread(la[L].W1.v,4,W1_SZ,f);
+        got += fread(la[L].W2.m,4,W2_SZ,f); got += fread(la[L].W2.v,4,W2_SZ,f);
+        got += fread(la[L].W3.m,4,W3_SZ,f); got += fread(la[L].W3.v,4,W3_SZ,f);
+        got += fread(la[L].rms_att.m,4,DIM,f); got += fread(la[L].rms_att.v,4,DIM,f);
+        got += fread(la[L].rms_ffn.m,4,DIM,f); got += fread(la[L].rms_ffn.v,4,DIM,f);
+    }
+    got += fread(rms_final,4,DIM,f); got += fread(arms_final->m,4,DIM,f); got += fread(arms_final->v,4,DIM,f);
+    got += fread(embed,4,(size_t)VOCAB*DIM,f); got += fread(aembed->m,4,(size_t)VOCAB*DIM,f); got += fread(aembed->v,4,(size_t)VOCAB*DIM,f);
+    fclose(f);
+    if (got != 3*((size_t)NLAYERS*(3*WQ_SZ + WO_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM) + DIM + (size_t)VOCAB*DIM)) return false;  // weights + m + v
+    *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss; *adam_t = h.adam_t;  // assigned only on success
+    *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall; *cs = h.cum_steps; *cb = h.cum_batches;
+    return true;
+}
+
+// ===== Main =====
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+        mach_timebase_info(&g_tb);
+
+        int total_steps = 10000;
+        float lr = 3e-4f;
+        float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f;
+        int adam_t = 0, start_step = 0;
+
+        // Parse args
+        bool do_resume = false;
+        for (int i=1; i<argc; i++) {
+            if (strcmp(argv[i], "--resume") == 0) do_resume = true;
+            else if (strcmp(argv[i], "--steps") == 0 && i+1<argc) total_steps = atoi(argv[++i]);
+            else if (strcmp(argv[i], "--lr") == 0 && i+1<argc) lr = atof(argv[++i]);
+        }
+
+        // Allocate per-layer state
+        LayerWeights lw[NLAYERS];
+        LayerAdam la[NLAYERS];
+        LayerActs acts[NLAYERS];
+        LayerGrads grads[NLAYERS];
+        LayerKernels kern[NLAYERS];
+        for (int L=0; L<NLAYERS; L++) {
+            lw[L] = layer_weights_alloc();
+            la[L] = layer_adam_alloc();
+            acts[L] = layer_acts_alloc();
+            grads[L] = layer_grads_alloc();
+            memset(&kern[L], 0, sizeof(LayerKernels));
+        }
+
+        // Final RMSNorm + embedding + classifier
+        float *rms_final = (float*)malloc(DIM*4);
+        float *embed = (float*)malloc(VOCAB*DIM*4);  // [VOCAB, DIM] row-major
+        float *grms_final = (float*)calloc(DIM, 4);
+        float *gembed = (float*)calloc(VOCAB*DIM, 4);
+        AdamState arms_final = adam_alloc(DIM);
+        AdamState aembed = adam_alloc((size_t)VOCAB*DIM);
+
+        double cum_compile=0, cum_train=0, cum_wall=0;
+        int cum_steps=0, cum_batches=0;
+
+        float resume_loss = 0;
+        bool resuming = false;
+        if (do_resume) {
+            resuming = load_checkpoint(CKPT_PATH, &start_step, &total_steps, &lr, &resume_loss,
+                &cum_compile, &cum_train, &cum_wall, &cum_steps, &cum_batches, &adam_t,
+                lw, la, rms_final, &arms_final, embed, &aembed);
+            if (resuming) printf("[RESUMED step %d, loss=%.4f]\n", start_step, resume_loss);
+        }
+        if (!resuming) {
+            printf("=== ANE Training: Stories110M (12 layers) ===\n");
+            printf("dim=%d hidden=%d heads=%d seq=%d vocab=%d layers=%d\n", DIM, HIDDEN, HEADS, SEQ, VOCAB, NLAYERS);
+            if (!load_pretrained(lw, rms_final, embed, MODEL_PATH)) {
+                printf("Pretrained load failed, using random init\n");
+                srand48(42);
+                float scale_d=1.0f/sqrtf(DIM), scale_h=1.0f/sqrtf(HIDDEN);
+                for (int L=0; L<NLAYERS; L++) {
+                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wq[i]=scale_d*(2*drand48()-1);lw[L].Wk[i]=scale_d*(2*drand48()-1);}
+                    for(size_t i=0;i<WQ_SZ;i++){lw[L].Wv[i]=scale_d*(2*drand48()-1);lw[L].Wo[i]=scale_d*(2*drand48()-1);}
+                    for(size_t i=0;i<W1_SZ;i++) lw[L].W1[i]=scale_h*(2*drand48()-1);
+                    for(size_t i=0;i<W2_SZ;i++) lw[L].W2[i]=scale_d*(2*drand48()-1);
+                    for(size_t i=0;i<W3_SZ;i++) lw[L].W3[i]=scale_h*(2*drand48()-1);
+                    for(int i=0;i<DIM;i++){lw[L].rms_att[i]=1.0f; lw[L].rms_ffn[i]=1.0f;}
+                }
+                for(int i=0;i<DIM;i++) rms_final[i]=1.0f;
+                float escale = 0.02f;
+                for(size_t i=0;i<(size_t)VOCAB*DIM;i++) embed[i]=escale*(2*drand48()-1);
+            }
+            size_t tp = (size_t)NLAYERS*LAYER_PARAMS + DIM + (size_t)VOCAB*DIM;
+            double xfmr_params = (double)NLAYERS*LAYER_PARAMS;
+            double embed_params = (double)VOCAB*DIM;
+            printf("Params: %.2fM (transformer %.2fM + embed %.2fM)\n", tp/1e6, xfmr_params/1e6, embed_params/1e6);
+            printf("Kernels: %d (%d weight-bearing + %d static sdpaBwd2)\n",
+                   TOTAL_WEIGHT_KERNELS+NLAYERS, TOTAL_WEIGHT_KERNELS, NLAYERS);
+            printf("Accum %d steps per recompile | Adam LR=%.1e b1=%.1f b2=%.3f\n", ACCUM_STEPS, lr, adam_b1, adam_b2);
+            double fwd_f = NLAYERS*(4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
+            double bwd_dx_f = fwd_f, bwd_dw_f = fwd_f;
+            double sdpa_f = NLAYERS*2.0*HEADS*5*SEQ*SEQ*HD;
+            double cls_f = 2.0*VOCAB*DIM*SEQ;
+            double total_f = fwd_f + bwd_dx_f + bwd_dw_f + sdpa_f + cls_f*3;
+            double ane_f = fwd_f + bwd_dx_f + sdpa_f;
+            printf("FLOPs/step: fwd=%.0fM bwd_dx=%.0fM bwd_dW=%.0fM sdpa_bwd=%.0fM total=%.0fM\n",
+                   fwd_f/1e6, bwd_dx_f/1e6, bwd_dw_f/1e6, sdpa_f/1e6, total_f/1e6);
+            printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)\n\n", ane_f/1e6);
+        }
+
+        // mmap token data
+        int data_fd = open(DATA_PATH, O_RDONLY);
+        if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; }
+        struct stat st; fstat(data_fd, &st);
+        size_t data_len = st.st_size;
+        uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; }
+        size_t n_tokens = data_len / 2;
+        printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6);
+
+        // Gradient buffers shared across layers (reused each step)
+        float *dy = (float*)malloc(SEQ*DIM*4);            // gradient flowing backward
+        float *dffn = (float*)malloc(SEQ*DIM*4);
+        float *dh1 = (float*)malloc(SEQ*HIDDEN*4);
+        float *dh3 = (float*)malloc(SEQ*HIDDEN*4);
+        float *dx_ffn = (float*)malloc(SEQ*DIM*4);
+        float *dx2 = (float*)malloc(SEQ*DIM*4);
+        float *do_out_buf = (float*)malloc(SEQ*DIM*4);
+        float *dq = (float*)malloc(SEQ*DIM*4);
+        float *dk = (float*)malloc(SEQ*DIM*4);
+        float *dv = (float*)malloc(SEQ*DIM*4);
+        float *dx_attn = (float*)malloc(SEQ*DIM*4);
+
+        // x buffer for input to each layer (channel-first [DIM, SEQ])
+        float *x_cur = (float*)malloc(SEQ*DIM*4);
+        float *x_final = (float*)malloc(SEQ*DIM*4);     // after final rmsnorm
+        float *logits = (float*)malloc(SEQ*VOCAB*4);     // [VOCAB, SEQ] for cross-entropy
+        float *dlogits = (float*)malloc(SEQ*VOCAB*4);
+
+        // Compile static sdpaBwd2 kernels (no weights, one per layer)
+        Kern *sdpaBwd2[NLAYERS];
+        for (int L=0; L<NLAYERS; L++) {
+            sdpaBwd2[L] = compile_sdpa_bwd2();
+            if (!sdpaBwd2[L]) { printf("sdpaBwd2 compile failed\n"); return 1; }
+        }
+
+        dispatch_queue_t dw_q = dispatch_queue_create("dw_cblas", DISPATCH_QUEUE_SERIAL);
+        dispatch_group_t dw_grp = dispatch_group_create();
+
+        float last_loss = 999.0f;
+        double total_compile_ms=0, total_train_ms=0;
+        int total_steps_done=0, total_batches=0;
+        uint64_t t_wall_start = mach_absolute_time();
+
+        srand48(42 + start_step);
+
+        int step = start_step;
+        while (step < total_steps) {
+            // Check compile budget
+            if (g_compile_count + TOTAL_WEIGHT_KERNELS > MAX_COMPILES) {
+                for (int L=0; L<NLAYERS; L++) { free_layer_kernels(&kern[L]); free_kern(sdpaBwd2[L]); }
+                double wall = tb_ms(mach_absolute_time() - t_wall_start);
+                save_checkpoint(CKPT_PATH, step, total_steps, lr, last_loss,
+                    total_compile_ms+cum_compile, total_train_ms+cum_train, wall+cum_wall,
+                    total_steps_done+cum_steps, total_batches+cum_batches, adam_t,
+                    lw, la, rms_final, &arms_final, embed, &aembed);
+                printf("[exec() restart step %d, %d compiles, loss=%.4f]\n", step, g_compile_count, last_loss);
+                fflush(stdout);
+                execl(argv[0], argv[0], "--resume", NULL);
+                perror("execl"); return 1;
+            }
+
+            // Compile all layers' weight-bearing kernels
+            uint64_t tc = mach_absolute_time();
+            for (int L=0; L<NLAYERS; L++) free_layer_kernels(&kern[L]);
+
+            bool compile_ok = true;
+            for (int L=0; L<NLAYERS; L++) {
+                printf("  Compiling layer %d/%d... (%d compiles)\r", L+1, NLAYERS, g_compile_count);
+                fflush(stdout);
+                if (!compile_layer_kernels(&kern[L], &lw[L])) {
+                    printf("\nCompile failed at layer %d, restart\n", L);
+                    compile_ok = false; break;
+                }
+            }
+            if (!compile_ok) { g_compile_count = MAX_COMPILES; continue; }
+
+            // Re-compile sdpaBwd2 if needed (after exec restart)
+            for (int L=0; L<NLAYERS; L++) {
+                if (!sdpaBwd2[L]) {
+                    sdpaBwd2[L] = compile_sdpa_bwd2();
+                    if (!sdpaBwd2[L]) { printf("sdpaBwd2 recompile failed\n"); return 1; }
+                }
+            }
+
+            double cms = tb_ms(mach_absolute_time() - tc);
+            total_compile_ms += cms;
+            printf("  Compiled %d kernels in %.0fms                    \n", TOTAL_WEIGHT_KERNELS, cms);
+
+            // Zero gradient accumulators
+            for (int L=0; L<NLAYERS; L++) layer_grads_zero(&grads[L]);
+            memset(grms_final, 0, DIM*4);
+            memset(gembed, 0, (size_t)VOCAB*DIM*4);
+
+            int steps_batch = 0;
+            uint64_t tt = mach_absolute_time();
+            double t_ane=0,t_io=0,t_elem=0,t_rms=0,t_cblas_wait=0,t_cls=0;
+
+            for (int a=0; a<ACCUM_STEPS && step<total_steps; a++, step++) {
+                uint64_t t0,t1;
+                // Sample random position in token data
+                size_t max_pos = n_tokens - SEQ - 1;
+                size_t pos = (size_t)(drand48() * max_pos);
+                uint16_t *input_tokens = token_data + pos;
+                uint16_t *target_tokens = token_data + pos + 1;
+
+                // Embedding lookup → x_cur [DIM, SEQ] channel-first
+                t0=mach_absolute_time();
+                embed_lookup(x_cur, embed, input_tokens, DIM, SEQ);
+                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
+
+                // ===== FORWARD (12 layers) =====
+                for (int L=0; L<NLAYERS; L++) {
+                    LayerActs *ac = &acts[L];
+
+                    // Save layer input for rmsnorm1 backward
+                    memcpy(ac->layer_in, x_cur, SEQ*DIM*4);
+                    // Attention forward: x_cur → o_out,Q,K,V,attn_out,xnorm
+                    t0=mach_absolute_time();
+                    dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+                    t1=mach_absolute_time(); t_cblas_wait+=tb_ms(t1-t0); t0=t1;
+                    io_write_fp16(kern[L].fwdAttn->ioIn, x_cur, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    ane_eval(kern[L].fwdAttn);
+                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->o_out,    0,     DIM, SEQ);
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->attn_out, 4*DIM, DIM, SEQ);
+                    io_read_fp16(kern[L].fwdAttn->ioOut, ac->xnorm,    5*DIM, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                    vDSP_vadd(x_cur, 1, ac->o_out, 1, ac->x2, 1, (vDSP_Length)(SEQ*DIM));
+                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+
+                    // FFN forward
+                    io_write_fp16(kern[L].fwdFFN->ioIn, ac->x2, DIM, SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+                    ane_eval(kern[L].fwdFFN);
+                    t1=mach_absolute_time(); t_ane+=tb_ms(t1-t0); t0=t1;
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->ffn_out,  0,              DIM,    SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h1,       DIM,            HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->h3,       DIM+HIDDEN,     HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->silu_out, DIM+2*HIDDEN,   HIDDEN, SEQ);
+                    io_read_fp16(kern[L].fwdFFN->ioOut, ac->x2norm,   DIM+3*HIDDEN,   DIM,    SEQ);
+                    t1=mach_absolute_time(); t_io+=tb_ms(t1-t0); t0=t1;
+
+                    vDSP_vadd(ac->x2, 1, ac->ffn_out, 1, x_cur, 1, (vDSP_Length)(SEQ*DIM));
+                    t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0);
+                }
+
+                // Final RMSNorm (CPU)
+                t0=mach_absolute_time();
+                rmsnorm(x_final, x_cur, rms_final, DIM, SEQ);
+                t1=mach_absolute_time(); t_rms+=tb_ms(t1-t0); t0=t1;
+
+                // Classifier: logits = embed^T @ x_final
+                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                            VOCAB, SEQ, DIM, 1.0f,
+                            embed, DIM, x_final, SEQ, 0.0f, logits, SEQ);
+                t1=mach_absolute_time(); t_cls+=tb_ms(t1-t0); t0=t1;
+
+                // Cross-entropy loss
+                float loss = cross_entropy_loss(dlogits, logits, target_tokens, VOCAB, SEQ);
+                last_loss = loss;
+                t1=mach_absolute_time(); t_elem+=tb_ms(t1-t0); t0=t1;
+
+                // ===== BACKWARD =====
+                // dlogits already computed by cross_entropy_loss
+
+                // Classifier backward: dx_final = embed^T @ dlogits, dembed += dlogits @ x_final^T
+                // dx_final[DIM,SEQ] = embed^T[DIM,VOCAB] @ dlogits[VOCAB,SEQ]
+                cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                            DIM, SEQ, VOCAB, 1.0f,
+                            embed, DIM, dlogits, SEQ, 0.0f, dy, SEQ);
+
+                // dembed[VOCAB,DIM] += dlogits[VOCAB,SEQ] @ x_final^T[SEQ,DIM]
+                dispatch_group_async(dw_grp, dw_q, ^{
+                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                                VOCAB, DIM, SEQ, 1.0f,
+                                dlogits, SEQ, x_final, SEQ, 1.0f, gembed, DIM);
+                });
+
+                // Final RMSNorm backward
+                float *dx_rms_final = (float*)calloc(SEQ*DIM, 4);
+                rmsnorm_bwd(dx_rms_final, grms_final, dy, x_cur, rms_final, DIM, SEQ);
+                memcpy(dy, dx_rms_final, SEQ*DIM*4);
+                free(dx_rms_final);
+
+                // ===== BACKWARD (12 layers, reverse) =====
+                for (int L=NLAYERS-1; L>=0; L--) {
+                    LayerActs *ac = &acts[L];
+                    LayerGrads *gr = &grads[L];
+
+                    // dy is the gradient at the output of this layer
+                    // dffn = dy (residual connection: d(x2 + ffn) = dy for both)
+                    memcpy(dffn, dy, SEQ*DIM*4);
+
+                    // FFN backward (ANE)
+                    io_write_fp16_at(kern[L].ffnBwd->ioIn, 0, dffn, DIM, SEQ);
+                    io_copy(kern[L].ffnBwd->ioIn, DIM, kern[L].fwdFFN->ioOut, DIM, 2*HIDDEN, SEQ);
+                    ane_eval(kern[L].ffnBwd);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dx_ffn, 0,           DIM,    SEQ);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dh1,    DIM,         HIDDEN, SEQ);
+                    io_read_fp16(kern[L].ffnBwd->ioOut, dh3,    DIM+HIDDEN,  HIDDEN, SEQ);
+
+                    // dW FFN async
+                    float *capt_dffn = (float*)malloc(SEQ*DIM*4); memcpy(capt_dffn, dffn, SEQ*DIM*4);
+                    float *capt_silu = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_silu, ac->silu_out, SEQ*HIDDEN*4);
+                    float *capt_dh1 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh1, dh1, SEQ*HIDDEN*4);
+                    float *capt_dh3 = (float*)malloc(SEQ*HIDDEN*4); memcpy(capt_dh3, dh3, SEQ*HIDDEN*4);
+                    float *capt_x2n = (float*)malloc(SEQ*DIM*4); memcpy(capt_x2n, ac->x2norm, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, HIDDEN, SEQ,
+                                    1.0f, capt_dffn, SEQ, capt_silu, SEQ, 1.0f, gr->W2, HIDDEN);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                    1.0f, capt_dh1, SEQ, capt_x2n, SEQ, 1.0f, gr->W1, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, HIDDEN, DIM, SEQ,
+                                    1.0f, capt_dh3, SEQ, capt_x2n, SEQ, 1.0f, gr->W3, DIM);
+                        free(capt_dffn); free(capt_silu); free(capt_dh1); free(capt_dh3); free(capt_x2n);
+                    });
+
+                    // RMSNorm2 backward
+                    memset(dx2, 0, SEQ*DIM*4);
+                    rmsnorm_bwd(dx2, gr->rms_ffn, dx_ffn, ac->x2, lw[L].rms_ffn, DIM, SEQ);
+                    // Add residual: dx2 += dy (from skip connection)
+                    for(int i=0;i<SEQ*DIM;i++) dx2[i] += dy[i];
+
+                    // dWo async
+                    memcpy(do_out_buf, dx2, SEQ*DIM*4);
+                    float *capt_do = (float*)malloc(SEQ*DIM*4); memcpy(capt_do, do_out_buf, SEQ*DIM*4);
+                    float *capt_attn = (float*)malloc(SEQ*DIM*4); memcpy(capt_attn, ac->attn_out, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_do, SEQ, capt_attn, SEQ, 1.0f, gr->Wo, DIM);
+                        free(capt_do); free(capt_attn);
+                    });
+
+                    // SDPA backward (ANE)
+                    io_copy(kern[L].sdpaBwd1->ioIn, 0, kern[L].fwdAttn->ioOut, DIM, 3*DIM, SEQ);
+                    io_write_fp16_at(kern[L].sdpaBwd1->ioIn, 3*DIM, dx2, DIM, SEQ);
+                    ane_eval(kern[L].sdpaBwd1);
+                    io_copy(sdpaBwd2[L]->ioIn, 0, kern[L].sdpaBwd1->ioOut, DIM, 2*SCORE_CH, SEQ);
+                    io_copy(sdpaBwd2[L]->ioIn, 2*SCORE_CH, kern[L].fwdAttn->ioOut, DIM, 2*DIM, SEQ);
+                    ane_eval(sdpaBwd2[L]);
+
+                    io_read_fp16(sdpaBwd2[L]->ioOut, dq, 0,   DIM, SEQ);
+                    io_read_fp16(sdpaBwd2[L]->ioOut, dk, DIM,  DIM, SEQ);
+                    io_read_fp16(kern[L].sdpaBwd1->ioOut, dv, 0, DIM, SEQ);
+
+                    // dWq/dWk/dWv async
+                    float *capt_dq = (float*)malloc(SEQ*DIM*4); memcpy(capt_dq, dq, SEQ*DIM*4);
+                    float *capt_dk = (float*)malloc(SEQ*DIM*4); memcpy(capt_dk, dk, SEQ*DIM*4);
+                    float *capt_dv = (float*)malloc(SEQ*DIM*4); memcpy(capt_dv, dv, SEQ*DIM*4);
+                    float *capt_xn = (float*)malloc(SEQ*DIM*4); memcpy(capt_xn, ac->xnorm, SEQ*DIM*4);
+                    dispatch_group_async(dw_grp, dw_q, ^{
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dq, SEQ, capt_xn, SEQ, 1.0f, gr->Wq, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dk, SEQ, capt_xn, SEQ, 1.0f, gr->Wk, DIM);
+                        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, DIM, DIM, SEQ,
+                                    1.0f, capt_dv, SEQ, capt_xn, SEQ, 1.0f, gr->Wv, DIM);
+                        free(capt_dq); free(capt_dk); free(capt_dv); free(capt_xn);
+                    });
+
+                    // QKV backward (ANE)
+                    io_copy(kern[L].qkvBwd->ioIn, 0, sdpaBwd2[L]->ioOut, 0, 2*DIM, SEQ);
+                    io_copy(kern[L].qkvBwd->ioIn, 2*DIM, kern[L].sdpaBwd1->ioOut, 0, DIM, SEQ);
+                    ane_eval(kern[L].qkvBwd);
+                    io_read_fp16(kern[L].qkvBwd->ioOut, dx_attn, 0, DIM, SEQ);
+
+                    // RMSNorm1 backward (using saved layer input)
+                    float *dx_rms1 = (float*)calloc(SEQ*DIM, 4);
+                    rmsnorm_bwd(dx_rms1, gr->rms_att, dx_attn, ac->layer_in, lw[L].rms_att, DIM, SEQ);
+
+                    // Gradient w.r.t. the layer input (becomes dy for the next iteration, going backward).
+                    // The layer input reaches x2 along two paths, and their gradients are summed:
+                    //   1. through rmsnorm1 + attention (o_out)            -> dx_rms1
+                    //   2. directly via the skip connection, since x2 = layer_input + o_out
+                    //      and therefore d(x2)/d(layer_input) = 1        -> dx2
+                    // dx2 is d(loss)/d(x2); feeding it into the SDPA backward does not consume it,
+                    // so it is still added here for the skip path.
+                    //
+                    // Gradient flow for this layer:
+                    //   dy → split to (dffn, dy_skip)  [dy_skip = dy due to residual]
+                    //   dffn → ffnBwd → dx_ffn
+                    //   dx_ffn → rmsnorm2_bwd → dx_rms2
+                    //   dx2 = dx_rms2 + dy  (skip connection from residual x2 → output)
+                    //   dx2 → sdpaBwd1/sdpaBwd2 → dq, dk, dv  (through Wo^T)
+                    //   dq, dk, dv → qkvBwd → dx_attn
+                    //   dx_attn → rmsnorm1_bwd → dx_rms1
+                    //   dy_prev_layer = dx_rms1 + dx2  (skip connection input → x2)
+                    //
+                    // So: dy for previous layer = dx_rms1 + dx2
+                    for(int i=0;i<SEQ*DIM;i++) dy[i] = dx_rms1[i] + dx2[i];
+                    free(dx_rms1);
+                }
+
+                // Embedding backward
+                dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+                embed_backward(gembed, dy, input_tokens, DIM, SEQ);
+
+                steps_batch++;
+                if (step % 10 == 0 || step == start_step)
+                    printf("step %-4d loss=%.4f\n", step, loss);
+
+                // JSON telemetry to stderr
+                double step_ane = t_ane/steps_batch, step_io = t_io/steps_batch;
+                double step_cls = t_cls/steps_batch, step_elem = t_elem/steps_batch;
+                double step_rms = t_rms/steps_batch, step_cbw = t_cblas_wait/steps_batch;
+                fprintf(stderr, "{\"type\":\"step\",\"step\":%d,\"loss\":%.6f,"
+                    "\"t_ane\":%.3f,\"t_io\":%.3f,\"t_cls\":%.3f,"
+                    "\"t_elem\":%.3f,\"t_rms\":%.3f,\"t_cblas_wait\":%.3f,"
+                    "\"compiles\":%d}\n",
+                    step, loss, step_ane, step_io, step_cls, step_elem, step_rms, step_cbw, g_compile_count);
+            }
+            double tms = tb_ms(mach_absolute_time() - tt);
+            total_train_ms += tms;
+            total_steps_done += steps_batch;
+            total_batches++;
+
+            // Ensure all async dW finished
+            dispatch_group_wait(dw_grp, DISPATCH_TIME_FOREVER);
+
+            // Adam update (scale gradients by 1/steps_batch)
+            float gsc = 1.0f / steps_batch;
+            adam_t++;
+            for (int L=0; L<NLAYERS; L++) {
+                LayerGrads *g = &grads[L];
+                for(size_t i=0;i<WQ_SZ;i++){g->Wq[i]*=gsc;g->Wk[i]*=gsc;g->Wv[i]*=gsc;g->Wo[i]*=gsc;}
+                for(size_t i=0;i<W1_SZ;i++) g->W1[i]*=gsc;
+                for(size_t i=0;i<W2_SZ;i++) g->W2[i]*=gsc;
+                for(size_t i=0;i<W3_SZ;i++) g->W3[i]*=gsc;
+                for(int i=0;i<DIM;i++){g->rms_att[i]*=gsc; g->rms_ffn[i]*=gsc;}
+
+                adam_update(lw[L].Wq, g->Wq, &la[L].Wq, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wk, g->Wk, &la[L].Wk, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wv, g->Wv, &la[L].Wv, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].Wo, g->Wo, &la[L].Wo, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W1, g->W1, &la[L].W1, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W2, g->W2, &la[L].W2, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].W3, g->W3, &la[L].W3, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].rms_att, g->rms_att, &la[L].rms_att, adam_t, lr, adam_b1, adam_b2, adam_eps);
+                adam_update(lw[L].rms_ffn, g->rms_ffn, &la[L].rms_ffn, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            }
+            for(int i=0;i<DIM;i++) grms_final[i]*=gsc;
+            adam_update(rms_final, grms_final, &arms_final, adam_t, lr, adam_b1, adam_b2, adam_eps);
+            // Scale and update embed
+            for(size_t i=0;i<(size_t)VOCAB*DIM;i++) gembed[i]*=gsc;
+            adam_update(embed, gembed, &aembed, adam_t, lr, adam_b1, adam_b2, adam_eps);
+
+            printf("  [batch %d: compile=%.0fms train=%.1fms (%.1fms/step) compiles=%d]\n",
+                   steps_batch, cms, tms, tms/steps_batch, g_compile_count);
+            printf("    ane=%.1f io=%.1f cls=%.1f elem=%.1f rms=%.1f cblas_wait=%.1f ms/step\n",
+                   t_ane/steps_batch, t_io/steps_batch, t_cls/steps_batch, t_elem/steps_batch,
+                   t_rms/steps_batch, t_cblas_wait/steps_batch);
+
+            // JSON batch telemetry to stderr
+            {
+                double bf = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
+                double bs = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
+                double ane_f_batch = (bf*2 + bs) * steps_batch;
+                double ane_tflops = ane_f_batch / (tms * 1e9);
+                fprintf(stderr, "{\"type\":\"batch\",\"batch\":%d,\"compile_ms\":%.1f,"
+                    "\"train_ms\":%.1f,\"ms_per_step\":%.1f}\n",
+                    steps_batch, cms, tms, tms/steps_batch);
+                fprintf(stderr, "{\"type\":\"perf\",\"ane_tflops\":%.3f,\"ane_util_pct\":%.2f}\n",
+                    ane_tflops, 100.0*ane_tflops/ane_peak_tflops());
+            }
+        }
+
+        // Efficiency report
+        double wall = tb_ms(mach_absolute_time() - t_wall_start);
+        total_compile_ms += cum_compile; total_train_ms += cum_train;
+        wall += cum_wall; total_steps_done += cum_steps; total_batches += cum_batches;
+        double fwd_flops = NLAYERS * (4.0*2*DIM*DIM*SEQ + 2.0*2*DIM*HIDDEN*SEQ + 2.0*HIDDEN*DIM*SEQ);
+        double sdpa_flops = NLAYERS * 2.0*HEADS*5*SEQ*SEQ*HD;
+        double cls_flops = 2.0*VOCAB*DIM*SEQ;
+        double total_flops = (fwd_flops*3 + sdpa_flops + cls_flops*3) * total_steps_done;
+        double ane_flops = (fwd_flops*2 + sdpa_flops) * total_steps_done;
+        printf("\n=== Efficiency Report ===\n");
+        printf("Total steps:     %d\n", total_steps_done);
+        printf("Wall time:       %.0f ms (%.1f s)\n", wall, wall/1000);
+        printf("Compile time:    %.0f ms (%.1f%%)\n", total_compile_ms, 100*total_compile_ms/wall);
+        printf("Train time:      %.0f ms (%.1f%%)\n", total_train_ms, 100*total_train_ms/wall);
+        printf("Avg train:       %.1f ms/step\n", total_train_ms/total_steps_done);
+        printf("ANE TFLOPS:      %.2f sustained\n", ane_flops / (total_train_ms * 1e9));
+        printf("Total TFLOPS:    %.2f (ANE+CPU)\n", total_flops / (total_train_ms * 1e9));
+        printf("ANE utilization: %.1f%% of %.1f TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/ane_peak_tflops(), ane_peak_tflops());
+
+        // Cleanup
+        for (int L=0; L<NLAYERS; L++) {
+            free_layer_kernels(&kern[L]);
+            free_kern(sdpaBwd2[L]);
+            layer_weights_free(&lw[L]);
+            layer_adam_free(&la[L]);
+            layer_acts_free(&acts[L]);
+            layer_grads_free(&grads[L]);
+        }
+        munmap(token_data, data_len);
+        close(data_fd);
+        free(rms_final); free(embed); free(grms_final); free(gembed);
+        adam_free(&arms_final); adam_free(&aembed);
+        free(dy); free(dffn); free(dh1); free(dh3); free(dx_ffn); free(dx2);
+        free(do_out_buf); free(dq); free(dk); free(dv); free(dx_attn);
+        free(x_cur); free(x_final); free(logits); free(dlogits);
+    }
+    return 0;
+}