Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
*.o
ane_probe
api_explore
inmem_basic
inmem_peak
tiny_train
tiny_train_m1
train
train_large
test_weight_reload
test_perf_stats
test_qos_sweep
test_ane_advanced
test_ane_causal_attn
test_ane_sdpa5
test_conv_attn3
test_full_fused
test_fused_bwd
test_fused_qkv

67 changes: 49 additions & 18 deletions inmem_peak.m
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
#import <dlfcn.h>
#import <mach/mach_time.h>
#import <IOSurface/IOSurface.h>
#include "ane_compat.h"

static mach_timebase_info_data_t g_tb;
// Convert mach_absolute_time() ticks to milliseconds using the cached
// timebase (g_tb must be filled via mach_timebase_info() before use).
static double ticksToMs(uint64_t t) {
    double nanos = (double)t * g_tb.numer / g_tb.denom;
    return nanos / 1e6;
}
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly

NSData *buildWeightBlob(int ch, int depth) {
NSUInteger wsize = ch * ch * 2;
Expand All @@ -27,28 +29,47 @@

NSString *genMIL(int ch, int sp, int depth) {
NSMutableString *m = [NSMutableString string];
[m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
[m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
[m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n"
@" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp];
[m appendString:@"program(%s)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
if (g_fp16_io) {
// fp16 I/O path — no cast ops (M1/M2 compatible)
[m appendFormat:@" func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), ch, sp];
} else {
// fp32 I/O path — cast to/from fp16 internally (M4+ native)
[m appendFormat:@" func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ane_mil_target(), ch, sp];
}
[m appendString:
@" tensor<string, []> c_pad_type_0 = const()[name = tensor<string, []>(\"c_pad_type_0\"), val = tensor<string, []>(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = tensor<string, []>(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = tensor<string, []>(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = tensor<string, []>(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, []> c_groups_0 = const()[name = tensor<string, []>(\"c_groups_0\"), val = tensor<int32, []>(1)];\n"];
NSString *prev;
if (g_fp16_io) {
prev = @"x";
} else {
[m appendString:@" tensor<string, []> x_to_fp16_dtype_0 = const()[name = tensor<string, []>(\"x_to_fp16_dtype_0\"), val = tensor<string, []>(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor<string, []>(\"cast_in\")];\n", ch, sp];
prev = @"x_to_fp16";
}
NSUInteger cs = 64 + ch*ch*2;
NSString *prev = @"x_to_fp16";
for (int i = 0; i < depth; i++) {
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = string(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n",
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = tensor<string, []>(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n",
g_ane_platform.mil_program, ane_mil_target(),
ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)];
NSString *out = [NSString stringWithFormat:@"c%d", i];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n",
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor<string, []>(\"%@\")];\n",
g_ane_platform.mil_program, ane_mil_target(),
ch, sp, out, i, prev, out];
prev = out;
}
[m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
if (g_fp16_io) {
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> c = identity(x = %@)[name = tensor<string, []>(\"out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
} else {
[m appendString:@" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = tensor<string, []>(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
}
return m;
}

Expand All @@ -68,9 +89,18 @@
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){
[fm removeItemAtPath:td error:nil];
if (!g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
return bench(ch, sp, depth);
}
return -3;
}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;}
NSUInteger bytes=ch*sp*4;
size_t bpe = g_fp16_io ? 2 : 4;
NSUInteger bytes=ch*sp*bpe;
IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI);
Expand All @@ -87,6 +117,7 @@
}

int main() {
ane_detect_platform(); ane_print_platform();
mach_timebase_info(&g_tb);
dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",RTLD_NOW);
printf("=== Programmatic MIL → In-Memory ANE Peak ===\n\n");
Expand All @@ -104,7 +135,7 @@ int main() {
char l[64]; snprintf(l,64,"%dx conv %dch sp%d",d,c,s);
double ms=bench(c,s,d);
double tf=ms>0?gf/ms:0;
if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100);
if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/ane_peak_tflops()*100);
else printf("%-28s %6.1f %6.2f FAIL(%.0f)\n",l,w,gf,ms);
}
return 0;
Expand Down
223 changes: 223 additions & 0 deletions training/ane_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// ane_compat.h — Runtime platform detection for Apple Silicon ANE compatibility
// Detects chip family, macOS version, ANE peak TFLOPS, and appropriate MIL target
#pragma once
#import <Foundation/Foundation.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>

// Chip family enumeration.
// ANE_CHIP_UNKNOWN is explicitly 0; the remaining members take sequential
// values in declaration order (M1 … M5, each with Pro/Max/Ultra variants).
typedef enum {
    ANE_CHIP_UNKNOWN = 0,
    // M1 generation
    ANE_CHIP_M1,
    ANE_CHIP_M1_PRO,
    ANE_CHIP_M1_MAX,
    ANE_CHIP_M1_ULTRA,
    // M2 generation
    ANE_CHIP_M2,
    ANE_CHIP_M2_PRO,
    ANE_CHIP_M2_MAX,
    ANE_CHIP_M2_ULTRA,
    // M3 generation
    ANE_CHIP_M3,
    ANE_CHIP_M3_PRO,
    ANE_CHIP_M3_MAX,
    ANE_CHIP_M3_ULTRA,
    // M4 generation
    ANE_CHIP_M4,
    ANE_CHIP_M4_PRO,
    ANE_CHIP_M4_MAX,
    ANE_CHIP_M4_ULTRA,
    // M5 generation
    ANE_CHIP_M5,
    ANE_CHIP_M5_PRO,
    ANE_CHIP_M5_MAX,
    ANE_CHIP_M5_ULTRA,
} ANEChipFamily;

// Platform info resolved at runtime by ane_detect_platform().
typedef struct {
    ANEChipFamily chip;       // detected family; ANE_CHIP_UNKNOWN if brand didn't match
    char chip_name[64];       // raw sysctl brand string, e.g. "Apple M4"
    int macos_major;          // e.g. 14, 15
    int macos_minor;          // e.g. 0, 1
    double ane_peak_tflops;   // Estimated FP16 peak TFLOPS (best-effort, see table below)
    const char *mil_target;   // "ios16", "ios17", or "ios18" (points at a string literal)
    const char *mil_program;  // "1.0" for ios16/17, "1.3" for ios18
    bool api_available;       // Whether the private _ANEInMemoryModel classes are loadable
} ANEPlatform;

// Global platform info (written once by ane_detect_platform; the flag below
// guards re-detection — not thread-safe, call from one thread at startup).
static ANEPlatform g_ane_platform = {0};
static bool g_ane_platform_detected = false;

// ---- Internal helpers ----

// Match chip family from a sysctl brand string (e.g. "Apple M4", "Apple M2 Pro").
// Entries are ordered most-specific-first so "M1 Ultra" is tested before "M1";
// the first substring hit wins, mirroring the original cascade of strstr checks.
static ANEChipFamily _ane_identify_chip(const char *brand) {
    static const struct {
        const char *needle;
        ANEChipFamily chip;
    } kBrandMap[] = {
        {"M5 Ultra", ANE_CHIP_M5_ULTRA}, {"M5 Max", ANE_CHIP_M5_MAX},
        {"M5 Pro",   ANE_CHIP_M5_PRO},   {"M5",     ANE_CHIP_M5},
        {"M4 Ultra", ANE_CHIP_M4_ULTRA}, {"M4 Max", ANE_CHIP_M4_MAX},
        {"M4 Pro",   ANE_CHIP_M4_PRO},   {"M4",     ANE_CHIP_M4},
        {"M3 Ultra", ANE_CHIP_M3_ULTRA}, {"M3 Max", ANE_CHIP_M3_MAX},
        {"M3 Pro",   ANE_CHIP_M3_PRO},   {"M3",     ANE_CHIP_M3},
        {"M2 Ultra", ANE_CHIP_M2_ULTRA}, {"M2 Max", ANE_CHIP_M2_MAX},
        {"M2 Pro",   ANE_CHIP_M2_PRO},   {"M2",     ANE_CHIP_M2},
        {"M1 Ultra", ANE_CHIP_M1_ULTRA}, {"M1 Max", ANE_CHIP_M1_MAX},
        {"M1 Pro",   ANE_CHIP_M1_PRO},   {"M1",     ANE_CHIP_M1},
    };
    for (size_t i = 0; i < sizeof(kBrandMap) / sizeof(kBrandMap[0]); i++) {
        if (strstr(brand, kBrandMap[i].needle) != NULL) {
            return kBrandMap[i].chip;
        }
    }
    return ANE_CHIP_UNKNOWN;
}

// Estimated FP16 ANE peak TFLOPS per chip.
// Apple publishes INT8 TOPS; FP16 throughput is roughly half.
// Values are best-effort estimates from known hardware specs.
// Ultra variants double the base die's ANE (2x neural engines).
// Estimated FP16 ANE peak TFLOPS per chip family.
// Apple publishes INT8 TOPS; FP16 throughput is roughly half of that.
// Values are best-effort estimates from known hardware specs.
// Ultra variants carry two neural engines, so they double the base die's figure.
static double _ane_peak_tflops(ANEChipFamily chip) {
    switch (chip) {
        case ANE_CHIP_M1:
        case ANE_CHIP_M1_PRO:
        case ANE_CHIP_M1_MAX:
            return 5.5;
        case ANE_CHIP_M1_ULTRA:
            return 11.0;
        case ANE_CHIP_M2:
        case ANE_CHIP_M2_PRO:
        case ANE_CHIP_M2_MAX:
            return 7.9;            // 15.8 TOPS / 2
        case ANE_CHIP_M2_ULTRA:
            return 15.8;
        case ANE_CHIP_M3:
        case ANE_CHIP_M3_PRO:
        case ANE_CHIP_M3_MAX:
            return 9.0;            // 18 TOPS / 2
        case ANE_CHIP_M3_ULTRA:
            return 18.0;
        case ANE_CHIP_M4:
        case ANE_CHIP_M4_PRO:
        case ANE_CHIP_M4_MAX:
            return 15.8;           // Empirically measured in this project
        case ANE_CHIP_M4_ULTRA:
            return 31.6;
        case ANE_CHIP_M5:
        case ANE_CHIP_M5_PRO:
        case ANE_CHIP_M5_MAX:
            return 19.0;           // 38 TOPS / 2 (estimate)
        case ANE_CHIP_M5_ULTRA:
            return 38.0;
        default:
            return 15.8;           // Fallback: assume M4-class
    }
}

// Human-readable name for a chip family.
// kNames is indexed by the enum's sequential values and MUST stay in the same
// order as the ANEChipFamily declaration; out-of-range input maps to "Unknown",
// matching the original switch's default branch.
static const char *_ane_chip_name_str(ANEChipFamily chip) {
    static const char *const kNames[] = {
        "Unknown",
        "M1", "M1 Pro", "M1 Max", "M1 Ultra",
        "M2", "M2 Pro", "M2 Max", "M2 Ultra",
        "M3", "M3 Pro", "M3 Max", "M3 Ultra",
        "M4", "M4 Pro", "M4 Max", "M4 Ultra",
        "M5", "M5 Pro", "M5 Max", "M5 Ultra",
    };
    if ((int)chip < 0 || (size_t)chip >= sizeof(kNames) / sizeof(kNames[0])) {
        return "Unknown";
    }
    return kNames[chip];
}

// ---- Public API ----

// Detect the current platform. Call once at startup (idempotent — results are
// memoized in g_ane_platform; subsequent calls return the cached struct).
// Returns the populated ANEPlatform struct (also stored in g_ane_platform).
static ANEPlatform ane_detect_platform(void) {
    if (g_ane_platform_detected) return g_ane_platform;

    ANEPlatform p = {0};

    // 1. Detect chip via the sysctl CPU brand string (e.g. "Apple M4 Pro").
    char brand[128] = {0};
    size_t len = sizeof(brand);
    if (sysctlbyname("machdep.cpu.brand_string", brand, &len, NULL, 0) != 0) {
        // Fallback: hw.model (e.g. "Mac14,9"). This rarely contains a chip
        // marketing name, so this path usually yields ANE_CHIP_UNKNOWN.
        len = sizeof(brand);
        if (sysctlbyname("hw.model", brand, &len, NULL, 0) != 0) {
            brand[0] = '\0';  // both sysctls failed; leave brand empty
        }
    }
    // brand[] is zero-initialized and we copy at most sizeof-1 bytes, so
    // chip_name is always NUL-terminated.
    strncpy(p.chip_name, brand, sizeof(p.chip_name) - 1);
    p.chip = _ane_identify_chip(brand);

    // 2. Detect macOS version.
    NSOperatingSystemVersion ver = [[NSProcessInfo processInfo] operatingSystemVersion];
    p.macos_major = (int)ver.majorVersion;
    p.macos_minor = (int)ver.minorVersion;

    // 3. Set estimated ANE FP16 peak (used for utilization percentages).
    p.ane_peak_tflops = _ane_peak_tflops(p.chip);

    // 4. Select MIL target based on macOS version:
    //    - macOS 15+ (Sequoia) → ios18 + program(1.3)
    //    - macOS 14  (Sonoma)  → ios17 + program(1.0)
    //    - macOS 13  (Ventura) → ios16 + program(1.0)
    //    - older → unsupported; warn and fall back to the oldest known target
    if (p.macos_major >= 15) {
        p.mil_target = "ios18";
        p.mil_program = "1.3";
    } else if (p.macos_major == 14) {
        p.mil_target = "ios17";
        p.mil_program = "1.0";
    } else if (p.macos_major == 13) {
        p.mil_target = "ios16";
        p.mil_program = "1.0";
    } else {
        // Previously this fell back silently even though the comment called
        // the configuration unsupported; surface the mismatch without
        // changing the fallback behavior.
        fprintf(stderr,
                "[ane_compat] WARNING: macOS %d.%d predates Ventura; MIL "
                "target ios16 is a best-effort fallback and may not load.\n",
                p.macos_major, p.macos_minor);
        p.mil_target = "ios16";
        p.mil_program = "1.0";
    }

    // 5. Check availability of the private in-memory ANE API.
    p.api_available = (NSClassFromString(@"_ANEInMemoryModelDescriptor") != nil &&
                       NSClassFromString(@"_ANEInMemoryModel") != nil);

    g_ane_platform = p;
    g_ane_platform_detected = true;
    return p;
}

// Print a human-readable summary of the detected platform to stdout.
// Safe to call before ane_detect_platform(): detection runs on demand and
// is memoized, so this never re-probes the system.
static void ane_print_platform(void) {
    const ANEPlatform info = ane_detect_platform();
    printf("=== ANE Platform ===\n");
    printf(" Chip: %s (%s)\n", _ane_chip_name_str(info.chip), info.chip_name);
    printf(" macOS: %d.%d\n", info.macos_major, info.macos_minor);
    printf(" ANE peak: %.1f TFLOPS (FP16 est.)\n", info.ane_peak_tflops);
    printf(" MIL target: %s (program %s)\n", info.mil_target, info.mil_program);
    printf(" API ready: %s\n", info.api_available ? "YES" : "NO");
    printf("====================\n");
}

// Generate the MIL header string with the correct program version and build
// info for the detected platform. Uses the canonical verbose syntax that is
// compatible with all CoreML versions. Returns an autoreleased NSString.
static NSString *ane_mil_header(void) {
    if (!g_ane_platform_detected) ane_detect_platform();
    NSMutableString *header =
        [NSMutableString stringWithFormat:@"program(%s)\n", g_ane_platform.mil_program];
    [header appendString:
        @"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
    return header;
}

// Get the MIL function target annotation (e.g. "ios17" or "ios18")
static const char *ane_mil_target(void) {
if (!g_ane_platform_detected) ane_detect_platform();
return g_ane_platform.mil_target;
}

// Get the estimated ANE peak TFLOPS for utilization calculations.
// ane_detect_platform() is memoized, so this is cheap after the first call.
static double ane_peak_tflops(void) {
    return ane_detect_platform().ane_peak_tflops;
}
Loading