Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
*.o
ane_probe
api_explore
inmem_basic
inmem_peak
tiny_train
tiny_train_m1
train
train_large
test_weight_reload
test_perf_stats
test_qos_sweep
test_ane_advanced
test_ane_causal_attn
test_ane_sdpa5
test_conv_attn3
test_full_fused
test_fused_bwd
test_fused_qkv

67 changes: 49 additions & 18 deletions inmem_peak.m
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
#import <dlfcn.h>
#import <mach/mach_time.h>
#import <IOSurface/IOSurface.h>
#include "ane_compat.h"

static mach_timebase_info_data_t g_tb;
// Convert mach_absolute_time() ticks to milliseconds using the cached
// timebase (g_tb must be filled via mach_timebase_info() before use).
static double ticksToMs(uint64_t t) {
    double nanos = (double)t * g_tb.numer / g_tb.denom;
    return nanos / 1e6;
}
static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly

NSData *buildWeightBlob(int ch, int depth) {
NSUInteger wsize = ch * ch * 2;
Expand All @@ -27,28 +29,47 @@

NSString *genMIL(int ch, int sp, int depth) {
NSMutableString *m = [NSMutableString string];
[m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"];
[m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp];
[m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n"
@" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp];
[m appendString:@"program(%s)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
if (g_fp16_io) {
// fp16 I/O path — no cast ops (M1/M2 compatible)
[m appendFormat:@" func main<%s>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ane_mil_target(), ch, sp];
} else {
// fp32 I/O path — cast to/from fp16 internally (M4+ native)
[m appendFormat:@" func main<%s>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ane_mil_target(), ch, sp];
}
[m appendString:
@" tensor<string, []> c_pad_type_0 = const()[name = tensor<string, []>(\"c_pad_type_0\"), val = tensor<string, []>(\"valid\")];\n"
@" tensor<int32, [2]> c_strides_0 = const()[name = tensor<string, []>(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, [4]> c_pad_0 = const()[name = tensor<string, []>(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
@" tensor<int32, [2]> c_dilations_0 = const()[name = tensor<string, []>(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n"
@" tensor<int32, []> c_groups_0 = const()[name = tensor<string, []>(\"c_groups_0\"), val = tensor<int32, []>(1)];\n"];
NSString *prev;
if (g_fp16_io) {
prev = @"x";
} else {
[m appendString:@" tensor<string, []> x_to_fp16_dtype_0 = const()[name = tensor<string, []>(\"x_to_fp16_dtype_0\"), val = tensor<string, []>(\"fp16\")];\n"];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor<string, []>(\"cast_in\")];\n", ch, sp];
prev = @"x_to_fp16";
}
NSUInteger cs = 64 + ch*ch*2;
NSString *prev = @"x_to_fp16";
for (int i = 0; i < depth; i++) {
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = string(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n",
[m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = tensor<string, []>(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n",
g_ane_platform.mil_program, ane_mil_target(),
ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)];
NSString *out = [NSString stringWithFormat:@"c%d", i];
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n",
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor<string, []>(\"%@\")];\n",
g_ane_platform.mil_program, ane_mil_target(),
ch, sp, out, i, prev, out];
prev = out;
}
[m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
if (g_fp16_io) {
[m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> c = identity(x = %@)[name = tensor<string, []>(\"out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
} else {
[m appendString:@" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"];
[m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = tensor<string, []>(\"cast_out\")];\n", ch, sp, prev];
[m appendString:@" } -> (c);\n}\n"];
}
return m;
}

Expand All @@ -68,9 +89,18 @@
[fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
[milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
[wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){
[fm removeItemAtPath:td error:nil];
if (!g_fp16_io) {
printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
g_fp16_io = 1;
return bench(ch, sp, depth);
}
return -3;
}
if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;}
NSUInteger bytes=ch*sp*4;
size_t bpe = g_fp16_io ? 2 : 4;
NSUInteger bytes=ch*sp*bpe;
IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0});
id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI);
Expand All @@ -87,6 +117,7 @@
}

int main() {
ane_detect_platform(); ane_print_platform();
mach_timebase_info(&g_tb);
dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",RTLD_NOW);
printf("=== Programmatic MIL → In-Memory ANE Peak ===\n\n");
Expand All @@ -104,7 +135,7 @@ int main() {
char l[64]; snprintf(l,64,"%dx conv %dch sp%d",d,c,s);
double ms=bench(c,s,d);
double tf=ms>0?gf/ms:0;
if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100);
if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/ane_peak_tflops()*100);
else printf("%-28s %6.1f %6.2f FAIL(%.0f)\n",l,w,gf,ms);
}
return 0;
Expand Down
223 changes: 223 additions & 0 deletions training/ane_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// ane_compat.h — Runtime platform detection for Apple Silicon ANE compatibility
// Detects chip family, macOS version, ANE peak TFLOPS, and appropriate MIL target
#pragma once
#import <Foundation/Foundation.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>

// Chip family enumeration.
// ANE_CHIP_UNKNOWN is explicitly 0; the remaining members take sequential
// values in declaration order (M1 … M5, each with Pro/Max/Ultra variants).
typedef enum {
    ANE_CHIP_UNKNOWN = 0,
    // M1 generation
    ANE_CHIP_M1,
    ANE_CHIP_M1_PRO,
    ANE_CHIP_M1_MAX,
    ANE_CHIP_M1_ULTRA,
    // M2 generation
    ANE_CHIP_M2,
    ANE_CHIP_M2_PRO,
    ANE_CHIP_M2_MAX,
    ANE_CHIP_M2_ULTRA,
    // M3 generation
    ANE_CHIP_M3,
    ANE_CHIP_M3_PRO,
    ANE_CHIP_M3_MAX,
    ANE_CHIP_M3_ULTRA,
    // M4 generation
    ANE_CHIP_M4,
    ANE_CHIP_M4_PRO,
    ANE_CHIP_M4_MAX,
    ANE_CHIP_M4_ULTRA,
    // M5 generation
    ANE_CHIP_M5,
    ANE_CHIP_M5_PRO,
    ANE_CHIP_M5_MAX,
    ANE_CHIP_M5_ULTRA,
} ANEChipFamily;

// Platform info resolved at runtime by ane_detect_platform().
typedef struct {
    ANEChipFamily chip;       // detected family; ANE_CHIP_UNKNOWN if brand didn't match
    char chip_name[64];       // raw sysctl brand string, e.g. "Apple M4"
    int macos_major;          // e.g. 14, 15
    int macos_minor;          // e.g. 0, 1
    double ane_peak_tflops;   // Estimated FP16 peak TFLOPS (best-effort, see table below)
    const char *mil_target;   // "ios16", "ios17", or "ios18" (points at a string literal)
    const char *mil_program;  // "1.0" for ios16/17, "1.3" for ios18
    bool api_available;       // Whether the private _ANEInMemoryModel classes are loadable
} ANEPlatform;

// Global platform info (written once by ane_detect_platform; the flag below
// guards re-detection — not thread-safe, call from one thread at startup).
static ANEPlatform g_ane_platform = {0};
static bool g_ane_platform_detected = false;

// ---- Internal helpers ----

// Match chip family from a sysctl brand string (e.g. "Apple M4", "Apple M2 Pro").
// Entries are ordered most-specific-first so "M1 Ultra" is tested before "M1";
// the first substring hit wins, mirroring the original cascade of strstr checks.
static ANEChipFamily _ane_identify_chip(const char *brand) {
    static const struct {
        const char *needle;
        ANEChipFamily chip;
    } kBrandMap[] = {
        {"M5 Ultra", ANE_CHIP_M5_ULTRA}, {"M5 Max", ANE_CHIP_M5_MAX},
        {"M5 Pro",   ANE_CHIP_M5_PRO},   {"M5",     ANE_CHIP_M5},
        {"M4 Ultra", ANE_CHIP_M4_ULTRA}, {"M4 Max", ANE_CHIP_M4_MAX},
        {"M4 Pro",   ANE_CHIP_M4_PRO},   {"M4",     ANE_CHIP_M4},
        {"M3 Ultra", ANE_CHIP_M3_ULTRA}, {"M3 Max", ANE_CHIP_M3_MAX},
        {"M3 Pro",   ANE_CHIP_M3_PRO},   {"M3",     ANE_CHIP_M3},
        {"M2 Ultra", ANE_CHIP_M2_ULTRA}, {"M2 Max", ANE_CHIP_M2_MAX},
        {"M2 Pro",   ANE_CHIP_M2_PRO},   {"M2",     ANE_CHIP_M2},
        {"M1 Ultra", ANE_CHIP_M1_ULTRA}, {"M1 Max", ANE_CHIP_M1_MAX},
        {"M1 Pro",   ANE_CHIP_M1_PRO},   {"M1",     ANE_CHIP_M1},
    };
    for (size_t i = 0; i < sizeof(kBrandMap) / sizeof(kBrandMap[0]); i++) {
        if (strstr(brand, kBrandMap[i].needle) != NULL) {
            return kBrandMap[i].chip;
        }
    }
    return ANE_CHIP_UNKNOWN;
}

// Estimated FP16 ANE peak TFLOPS per chip.
// Apple publishes INT8 TOPS; FP16 throughput is roughly half.
// Values are best-effort estimates from known hardware specs.
// Ultra variants double the base die's ANE (2x neural engines).
// Estimated FP16 ANE peak TFLOPS per chip family.
// Apple publishes INT8 TOPS; FP16 throughput is roughly half of that.
// Values are best-effort estimates from known hardware specs.
// Ultra variants carry two neural engines, so they double the base die's figure.
static double _ane_peak_tflops(ANEChipFamily chip) {
    switch (chip) {
        case ANE_CHIP_M1:
        case ANE_CHIP_M1_PRO:
        case ANE_CHIP_M1_MAX:
            return 5.5;
        case ANE_CHIP_M1_ULTRA:
            return 11.0;
        case ANE_CHIP_M2:
        case ANE_CHIP_M2_PRO:
        case ANE_CHIP_M2_MAX:
            return 7.9;            // 15.8 TOPS / 2
        case ANE_CHIP_M2_ULTRA:
            return 15.8;
        case ANE_CHIP_M3:
        case ANE_CHIP_M3_PRO:
        case ANE_CHIP_M3_MAX:
            return 9.0;            // 18 TOPS / 2
        case ANE_CHIP_M3_ULTRA:
            return 18.0;
        case ANE_CHIP_M4:
        case ANE_CHIP_M4_PRO:
        case ANE_CHIP_M4_MAX:
            return 15.8;           // Empirically measured in this project
        case ANE_CHIP_M4_ULTRA:
            return 31.6;
        case ANE_CHIP_M5:
        case ANE_CHIP_M5_PRO:
        case ANE_CHIP_M5_MAX:
            return 19.0;           // 38 TOPS / 2 (estimate)
        case ANE_CHIP_M5_ULTRA:
            return 38.0;
        default:
            return 15.8;           // Fallback: assume M4-class
    }
}

// Human-readable name for a chip family.
// kNames is indexed by the enum's sequential values and MUST stay in the same
// order as the ANEChipFamily declaration; out-of-range input maps to "Unknown",
// matching the original switch's default branch.
static const char *_ane_chip_name_str(ANEChipFamily chip) {
    static const char *const kNames[] = {
        "Unknown",
        "M1", "M1 Pro", "M1 Max", "M1 Ultra",
        "M2", "M2 Pro", "M2 Max", "M2 Ultra",
        "M3", "M3 Pro", "M3 Max", "M3 Ultra",
        "M4", "M4 Pro", "M4 Max", "M4 Ultra",
        "M5", "M5 Pro", "M5 Max", "M5 Ultra",
    };
    if ((int)chip < 0 || (size_t)chip >= sizeof(kNames) / sizeof(kNames[0])) {
        return "Unknown";
    }
    return kNames[chip];
}

// ---- Public API ----

// Detect the current platform. Call once at startup (idempotent — results are
// memoized in g_ane_platform; subsequent calls return the cached struct).
// Returns the populated ANEPlatform struct (also stored in g_ane_platform).
static ANEPlatform ane_detect_platform(void) {
    if (g_ane_platform_detected) return g_ane_platform;

    ANEPlatform p = {0};

    // 1. Detect chip via the sysctl CPU brand string (e.g. "Apple M4 Pro").
    char brand[128] = {0};
    size_t len = sizeof(brand);
    if (sysctlbyname("machdep.cpu.brand_string", brand, &len, NULL, 0) != 0) {
        // Fallback: hw.model (e.g. "Mac14,9"). This rarely contains a chip
        // marketing name, so this path usually yields ANE_CHIP_UNKNOWN.
        len = sizeof(brand);
        if (sysctlbyname("hw.model", brand, &len, NULL, 0) != 0) {
            brand[0] = '\0';  // both sysctls failed; leave brand empty
        }
    }
    // brand[] is zero-initialized and we copy at most sizeof-1 bytes, so
    // chip_name is always NUL-terminated.
    strncpy(p.chip_name, brand, sizeof(p.chip_name) - 1);
    p.chip = _ane_identify_chip(brand);

    // 2. Detect macOS version.
    NSOperatingSystemVersion ver = [[NSProcessInfo processInfo] operatingSystemVersion];
    p.macos_major = (int)ver.majorVersion;
    p.macos_minor = (int)ver.minorVersion;

    // 3. Set estimated ANE FP16 peak (used for utilization percentages).
    p.ane_peak_tflops = _ane_peak_tflops(p.chip);

    // 4. Select MIL target based on macOS version:
    //    - macOS 15+ (Sequoia) → ios18 + program(1.3)
    //    - macOS 14  (Sonoma)  → ios17 + program(1.0)
    //    - macOS 13  (Ventura) → ios16 + program(1.0)
    //    - older → unsupported; warn and fall back to the oldest known target
    if (p.macos_major >= 15) {
        p.mil_target = "ios18";
        p.mil_program = "1.3";
    } else if (p.macos_major == 14) {
        p.mil_target = "ios17";
        p.mil_program = "1.0";
    } else if (p.macos_major == 13) {
        p.mil_target = "ios16";
        p.mil_program = "1.0";
    } else {
        // Previously this fell back silently even though the comment called
        // the configuration unsupported; surface the mismatch without
        // changing the fallback behavior.
        fprintf(stderr,
                "[ane_compat] WARNING: macOS %d.%d predates Ventura; MIL "
                "target ios16 is a best-effort fallback and may not load.\n",
                p.macos_major, p.macos_minor);
        p.mil_target = "ios16";
        p.mil_program = "1.0";
    }

    // 5. Check availability of the private in-memory ANE API.
    p.api_available = (NSClassFromString(@"_ANEInMemoryModelDescriptor") != nil &&
                       NSClassFromString(@"_ANEInMemoryModel") != nil);

    g_ane_platform = p;
    g_ane_platform_detected = true;
    return p;
}

// Print a human-readable summary of the detected platform to stdout.
// Safe to call before ane_detect_platform(): detection runs on demand and
// is memoized, so this never re-probes the system.
static void ane_print_platform(void) {
    const ANEPlatform info = ane_detect_platform();
    printf("=== ANE Platform ===\n");
    printf(" Chip: %s (%s)\n", _ane_chip_name_str(info.chip), info.chip_name);
    printf(" macOS: %d.%d\n", info.macos_major, info.macos_minor);
    printf(" ANE peak: %.1f TFLOPS (FP16 est.)\n", info.ane_peak_tflops);
    printf(" MIL target: %s (program %s)\n", info.mil_target, info.mil_program);
    printf(" API ready: %s\n", info.api_available ? "YES" : "NO");
    printf("====================\n");
}

// Generate the MIL header string with the correct program version and build
// info for the detected platform. Uses the canonical verbose syntax that is
// compatible with all CoreML versions. Returns an autoreleased NSString.
static NSString *ane_mil_header(void) {
    if (!g_ane_platform_detected) ane_detect_platform();
    NSMutableString *header =
        [NSMutableString stringWithFormat:@"program(%s)\n", g_ane_platform.mil_program];
    [header appendString:
        @"[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"];
    return header;
}

// Get the MIL function target annotation (e.g. "ios17" or "ios18")
static const char *ane_mil_target(void) {
if (!g_ane_platform_detected) ane_detect_platform();
return g_ane_platform.mil_target;
}

// Get the estimated ANE peak TFLOPS for utilization calculations.
// ane_detect_platform() is memoized, so this is cheap after the first call.
static double ane_peak_tflops(void) {
    return ane_detect_platform().ane_peak_tflops;
}
Loading