diff --git a/inmem_peak.m b/inmem_peak.m index 87b8163..5cae23c 100644 --- a/inmem_peak.m +++ b/inmem_peak.m @@ -5,6 +5,7 @@ #import #import #import +#include "training/ane_compat.h" static mach_timebase_info_data_t g_tb; static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } @@ -27,8 +28,8 @@ NSString *genMIL(int ch, int sp, int depth) { NSMutableString *m = [NSMutableString string]; - [m appendString:@"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; - [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + [m appendFormat:@"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, {\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"\"}})]\n{\n", g_ane_platform.mil_program]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), ch, sp]; [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n" @" tensor c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor([1, 1])];\n" @" tensor c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" @@ -89,6 +90,8 @@ int main() { mach_timebase_info(&g_tb); dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine",RTLD_NOW); + ane_detect_platform(); + ane_print_platform(); printf("=== Programmatic MIL → In-Memory ANE Peak ===\n\n"); printf("%-28s %7s %7s %9s %7s %6s\n","Config","W(MB)","GFLOP","ms/eval","TFLOPS","%%peak"); printf("----------------------------------------------------------------------\n"); @@ -104,7 +107,7 @@ int main() { char l[64]; snprintf(l,64,"%dx conv %dch sp%d",d,c,s); double ms=bench(c,s,d); double tf=ms>0?gf/ms:0; - if(ms>0)printf("%-28s %6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/0.019*100); + if(ms>0)printf("%-28s 
%6.1f %6.2f %7.3f ms %6.2f %5.1f%%\n",l,w,gf,ms,tf,tf/ane_peak_tflops()*100); else printf("%-28s %6.1f %6.2f FAIL(%.0f)\n",l,w,gf,ms); } return 0; diff --git a/training/ane_compat.h b/training/ane_compat.h new file mode 100644 index 0000000..8c5ed22 --- /dev/null +++ b/training/ane_compat.h @@ -0,0 +1,224 @@ +// ane_compat.h — Runtime platform detection for Apple Silicon ANE compatibility +// Detects chip family, macOS version, ANE peak TFLOPS, and appropriate MIL target +#pragma once +#import +#include +#include +#include + +// Chip family enumeration +typedef enum { + ANE_CHIP_UNKNOWN = 0, + ANE_CHIP_M1, + ANE_CHIP_M1_PRO, + ANE_CHIP_M1_MAX, + ANE_CHIP_M1_ULTRA, + ANE_CHIP_M2, + ANE_CHIP_M2_PRO, + ANE_CHIP_M2_MAX, + ANE_CHIP_M2_ULTRA, + ANE_CHIP_M3, + ANE_CHIP_M3_PRO, + ANE_CHIP_M3_MAX, + ANE_CHIP_M3_ULTRA, + ANE_CHIP_M4, + ANE_CHIP_M4_PRO, + ANE_CHIP_M4_MAX, + ANE_CHIP_M4_ULTRA, + ANE_CHIP_M5, + ANE_CHIP_M5_PRO, + ANE_CHIP_M5_MAX, + ANE_CHIP_M5_ULTRA, +} ANEChipFamily; + +// Platform info resolved at runtime +typedef struct { + ANEChipFamily chip; + char chip_name[64]; // e.g. "Apple M4" + int macos_major; // e.g. 14, 15 + int macos_minor; // e.g. 0, 1 + double ane_peak_tflops; // Estimated FP16 peak TFLOPS + const char *mil_target; // "ios16", "ios17", or "ios18" + const char *mil_program; // "1.0" for ios16/17, "1.3" for ios18 + bool api_available; // Whether _ANEInMemoryModel is available +} ANEPlatform; + +// Global platform info (set once by ane_detect_platform) +static ANEPlatform g_ane_platform = {0}; +static bool g_ane_platform_detected = false; + +// ---- Internal helpers ---- + +static ANEChipFamily _ane_identify_chip(const char *brand) { + // Match chip family from sysctl brand string (e.g. 
"Apple M4", "Apple M2 Pro") + if (strstr(brand, "M5 Ultra")) return ANE_CHIP_M5_ULTRA; + if (strstr(brand, "M5 Max")) return ANE_CHIP_M5_MAX; + if (strstr(brand, "M5 Pro")) return ANE_CHIP_M5_PRO; + if (strstr(brand, "M5")) return ANE_CHIP_M5; + if (strstr(brand, "M4 Ultra")) return ANE_CHIP_M4_ULTRA; + if (strstr(brand, "M4 Max")) return ANE_CHIP_M4_MAX; + if (strstr(brand, "M4 Pro")) return ANE_CHIP_M4_PRO; + if (strstr(brand, "M4")) return ANE_CHIP_M4; + if (strstr(brand, "M3 Ultra")) return ANE_CHIP_M3_ULTRA; + if (strstr(brand, "M3 Max")) return ANE_CHIP_M3_MAX; + if (strstr(brand, "M3 Pro")) return ANE_CHIP_M3_PRO; + if (strstr(brand, "M3")) return ANE_CHIP_M3; + if (strstr(brand, "M2 Ultra")) return ANE_CHIP_M2_ULTRA; + if (strstr(brand, "M2 Max")) return ANE_CHIP_M2_MAX; + if (strstr(brand, "M2 Pro")) return ANE_CHIP_M2_PRO; + if (strstr(brand, "M2")) return ANE_CHIP_M2; + if (strstr(brand, "M1 Ultra")) return ANE_CHIP_M1_ULTRA; + if (strstr(brand, "M1 Max")) return ANE_CHIP_M1_MAX; + if (strstr(brand, "M1 Pro")) return ANE_CHIP_M1_PRO; + if (strstr(brand, "M1")) return ANE_CHIP_M1; + return ANE_CHIP_UNKNOWN; +} + +// Estimated FP16 ANE peak TFLOPS per chip. +// Apple publishes INT8 TOPS; FP16 throughput is roughly half. +// Values are best-effort estimates from known hardware specs. +// Ultra variants double the base die's ANE (2x neural engines). 
+static double _ane_peak_tflops(ANEChipFamily chip) { + switch (chip) { + case ANE_CHIP_M1: return 5.5; + case ANE_CHIP_M1_PRO: return 5.5; + case ANE_CHIP_M1_MAX: return 5.5; + case ANE_CHIP_M1_ULTRA: return 11.0; + case ANE_CHIP_M2: return 7.9; // 15.8 TOPS / 2 + case ANE_CHIP_M2_PRO: return 7.9; + case ANE_CHIP_M2_MAX: return 7.9; + case ANE_CHIP_M2_ULTRA: return 15.8; + case ANE_CHIP_M3: return 9.0; // 18 TOPS / 2 + case ANE_CHIP_M3_PRO: return 9.0; + case ANE_CHIP_M3_MAX: return 9.0; + case ANE_CHIP_M3_ULTRA: return 18.0; + case ANE_CHIP_M4: return 15.8; // Empirically measured in this project + case ANE_CHIP_M4_PRO: return 15.8; + case ANE_CHIP_M4_MAX: return 15.8; + case ANE_CHIP_M4_ULTRA: return 31.6; + case ANE_CHIP_M5: return 19.0; // 38 TOPS / 2 (estimate) + case ANE_CHIP_M5_PRO: return 19.0; + case ANE_CHIP_M5_MAX: return 19.0; + case ANE_CHIP_M5_ULTRA: return 38.0; + default: return 15.8; // Fallback: assume M4-class + } +} + +static const char *_ane_chip_name_str(ANEChipFamily chip) { + switch (chip) { + case ANE_CHIP_M1: return "M1"; + case ANE_CHIP_M1_PRO: return "M1 Pro"; + case ANE_CHIP_M1_MAX: return "M1 Max"; + case ANE_CHIP_M1_ULTRA: return "M1 Ultra"; + case ANE_CHIP_M2: return "M2"; + case ANE_CHIP_M2_PRO: return "M2 Pro"; + case ANE_CHIP_M2_MAX: return "M2 Max"; + case ANE_CHIP_M2_ULTRA: return "M2 Ultra"; + case ANE_CHIP_M3: return "M3"; + case ANE_CHIP_M3_PRO: return "M3 Pro"; + case ANE_CHIP_M3_MAX: return "M3 Max"; + case ANE_CHIP_M3_ULTRA: return "M3 Ultra"; + case ANE_CHIP_M4: return "M4"; + case ANE_CHIP_M4_PRO: return "M4 Pro"; + case ANE_CHIP_M4_MAX: return "M4 Max"; + case ANE_CHIP_M4_ULTRA: return "M4 Ultra"; + case ANE_CHIP_M5: return "M5"; + case ANE_CHIP_M5_PRO: return "M5 Pro"; + case ANE_CHIP_M5_MAX: return "M5 Max"; + case ANE_CHIP_M5_ULTRA: return "M5 Ultra"; + default: return "Unknown"; + } +} + +// ---- Public API ---- + +// Detect the current platform. Call once at startup. 
+// Returns the populated ANEPlatform struct (also stored in g_ane_platform). +static ANEPlatform ane_detect_platform(void) { + if (g_ane_platform_detected) return g_ane_platform; + + ANEPlatform p = {0}; + + // 1. Detect chip via sysctl + char brand[128] = {0}; + size_t len = sizeof(brand); + if (sysctlbyname("machdep.cpu.brand_string", brand, &len, NULL, 0) != 0) { + // Fallback: try hw.machine or hw.model + len = sizeof(brand); + sysctlbyname("hw.model", brand, &len, NULL, 0); + } + strncpy(p.chip_name, brand, sizeof(p.chip_name) - 1); + p.chip = _ane_identify_chip(brand); + + // 2. Detect macOS version + NSOperatingSystemVersion ver = [[NSProcessInfo processInfo] operatingSystemVersion]; + p.macos_major = (int)ver.majorVersion; + p.macos_minor = (int)ver.minorVersion; + + // 3. Set ANE peak TFLOPS + p.ane_peak_tflops = _ane_peak_tflops(p.chip); + + // 4. Select MIL target based on macOS version + // - macOS 15+ (Sequoia) → ios18 + program(1.3) + // - macOS 14 (Sonoma) → ios17 + program(1.0) + // - macOS 13 (Ventura) → ios16 + program(1.0) + // - older → unsupported + if (p.macos_major >= 15) { + p.mil_target = "ios18"; + p.mil_program = "1.3"; + } else if (p.macos_major == 14) { + p.mil_target = "ios17"; + p.mil_program = "1.0"; + } else if (p.macos_major == 13) { + p.mil_target = "ios16"; + p.mil_program = "1.0"; + } else { + p.mil_target = "ios16"; + p.mil_program = "1.0"; + } + + // 5. 
Check API availability + p.api_available = (NSClassFromString(@"_ANEInMemoryModelDescriptor") != nil && + NSClassFromString(@"_ANEInMemoryModel") != nil); + + g_ane_platform = p; + g_ane_platform_detected = true; + return p; +} + +// Print detected platform info (call after ane_detect_platform) +static void ane_print_platform(void) { + if (!g_ane_platform_detected) ane_detect_platform(); + const ANEPlatform *p = &g_ane_platform; + printf("=== ANE Platform ===\n"); + printf(" Chip: %s (%s)\n", _ane_chip_name_str(p->chip), p->chip_name); + printf(" macOS: %d.%d\n", p->macos_major, p->macos_minor); + printf(" ANE peak: %.1f TFLOPS (FP16 est.)\n", p->ane_peak_tflops); + printf(" MIL target: %s (program %s)\n", p->mil_target, p->mil_program); + printf(" API ready: %s\n", p->api_available ? "YES" : "NO"); + printf("====================\n"); +} + +// Generate the MIL header string with correct program version and build info. +// Returns an autoreleased NSString. +static NSString *ane_mil_header(void) { + if (!g_ane_platform_detected) ane_detect_platform(); + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n", + g_ane_platform.mil_program]; +} + +// Get the MIL function target annotation (e.g. 
"ios17" or "ios18") +static const char *ane_mil_target(void) { + if (!g_ane_platform_detected) ane_detect_platform(); + return g_ane_platform.mil_target; +} + +// Get the ANE peak TFLOPS for utilization calculations +static double ane_peak_tflops(void) { + if (!g_ane_platform_detected) ane_detect_platform(); + return g_ane_platform.ane_peak_tflops; +} diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h index 97fc451..80694b5 100644 --- a/training/ane_mil_gen.h +++ b/training/ane_mil_gen.h @@ -1,208 +1,213 @@ -// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs -#pragma once -#import -#include -#include -#include - -// Build an FP16 weight blob with the required header structure. -// weights_f32: source weights in row-major [out_ch, in_ch] -// Returns NSData with header + FP16 weights -static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) { - NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16 - NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - uint8_t *chunk = buf + 64; - chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE; - chunk[4] = 0x01; - *(uint32_t*)(chunk + 8) = (uint32_t)wsize; // data_size - *(uint32_t*)(chunk + 16) = 128; // data_offset (from file start) - // Convert f32 → fp16 (simple truncation via _Float16) - _Float16 *fp16 = (_Float16*)(buf + 128); - for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++) - fp16[i] = (_Float16)weights_f32[i]; - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input) -// Input x: [1, in_ch, spatial] fp32 -// Input W: [1, out_ch, in_ch] fp32 -// Output: [1, out_ch, spatial] fp32 -static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { - return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = 
dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x, tensor W) {\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n" - " tensor W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n" - " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" - " bool ty = const()[name = string(\"ty\"), val = bool(false)];\n" - " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" - " } -> (y);\n" - "}\n", - in_ch, spatial, out_ch, in_ch, - in_ch, spatial, out_ch, in_ch, - out_ch, spatial, out_ch, spatial]; -} - -// Keep the baked-weight version for reference (used in inference-only scenarios) -static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { - return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = 
cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor y16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" - " } -> (y);\n" - "}\n", - in_ch, spatial, in_ch, spatial, - out_ch, in_ch, out_ch, in_ch, - out_ch, spatial, out_ch, spatial]; -} - -// Generate MIL for fused QKV: 3 parallel convs from same input -// Input: [1, dim, 1, S] -// Outputs: Q[1, dim, 1, S], K[1, dim, 1, S], V[1, dim, 1, S] -// Weight blob layout: Wq[dim,dim] @ offset 64, Wk @ offset 64+cs, Wv @ offset 64+2*cs -// where cs = 64 + dim*dim*2 -static NSString *mil_gen_qkv(int dim, int spatial) { - NSUInteger cs = 64 + (NSUInteger)dim * dim * 2; - return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor Wq = const()[name = string(\"Wq\"), 
" - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" - " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n" - " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n" - " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n" - " tensor k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n" - " tensor v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n" - " } -> (q, k, v);\n" - "}\n", - dim, spatial, dim, spatial, - dim, dim, dim, dim, - dim, dim, dim, dim, (unsigned long)(64 + cs), - dim, dim, dim, dim, (unsigned long)(64 + 2*cs), - dim, spatial, dim, spatial, dim, spatial, - dim, spatial, dim, spatial, dim, spatial]; -} - -// Build weight blob for fused QKV (3 weight matrices concatenated) -static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) { - NSUInteger wsize = (NSUInteger)dim * dim * 2; - NSUInteger cs = 64 + wsize; - NSUInteger total = 64 + 3 * cs; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - const float *ws[3] = {wq, wk, wv}; - for (int w = 0; w < 3; w++) { - uint8_t *chunk = buf 
+ 64 + w * cs; - chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; - chunk[4]=0x01; - *(uint32_t*)(chunk + 8) = (uint32_t)wsize; - *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset - _Float16 *fp16 = (_Float16*)(chunk + 64); - for (NSUInteger i = 0; i < (NSUInteger)dim * dim; i++) - fp16[i] = (_Float16)ws[w][i]; - } - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -// Build weight blob for fused FFN up (w1 + w3, both [hidden_dim, dim]) -static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) { - NSUInteger wsize = (NSUInteger)hidden_dim * dim * 2; - NSUInteger cs = 64 + wsize; - NSUInteger total = 64 + 2 * cs; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - const float *ws[2] = {w1, w3}; - for (int w = 0; w < 2; w++) { - uint8_t *chunk = buf + 64 + w * cs; - chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; - chunk[4]=0x01; - *(uint32_t*)(chunk + 8) = (uint32_t)wsize; - *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset - _Float16 *fp16 = (_Float16*)(chunk + 64); - for (NSUInteger i = 0; i < (NSUInteger)hidden_dim * dim; i++) - fp16[i] = (_Float16)ws[w][i]; - } - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -// Generate MIL for fused FFN up: w1 + w3 parallel convs -static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) { - NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2; - return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 
1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor W1 = const()[name = string(\"W1\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor W3 = const()[name = string(\"W3\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" - " tensor h1 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n" - " tensor h3 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n" - " tensor out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n" - " } -> (out1, out3);\n" - "}\n", - dim, spatial, dim, spatial, - hidden_dim, dim, hidden_dim, dim, - hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs), - hidden_dim, spatial, hidden_dim, spatial, - hidden_dim, spatial, hidden_dim, spatial]; -} +// ane_mil_gen.h — Generate MIL text for conv-based linear ops + weight blobs +#pragma once +#import +#include +#include +#include +#include "ane_compat.h" + +// Build an FP16 weight blob with the required header structure. 
+// weights_f32: source weights in row-major [out_ch, in_ch] +// Returns NSData with header + FP16 weights +static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int in_ch) { + NSUInteger wsize = (NSUInteger)out_ch * in_ch * 2; // FP16 + NSUInteger total = 64 + 64 + wsize; // global header + chunk header + data + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + uint8_t *chunk = buf + 64; + chunk[0] = 0xEF; chunk[1] = 0xBE; chunk[2] = 0xAD; chunk[3] = 0xDE; + chunk[4] = 0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; // data_size + *(uint32_t*)(chunk + 16) = 128; // data_offset (from file start) + // Convert f32 → fp16 (simple truncation via _Float16) + _Float16 *fp16 = (_Float16*)(buf + 128); + for (NSUInteger i = 0; i < (NSUInteger)out_ch * in_ch; i++) + fp16[i] = (_Float16)weights_f32[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Generate MIL for a single matmul: y = W @ x (using matmul op, weights as input) +// Input x: [1, in_ch, spatial] fp32 +// Input W: [1, out_ch, in_ch] fp32 +// Output: [1, out_ch, spatial] fp32 +static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x, tensor W) {\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n" + " tensor W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n" + " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" + " bool ty = const()[name = string(\"ty\"), val = bool(false)];\n" + " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n" + " string to_fp32 = 
const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " } -> (y);\n" + "}\n", + g_ane_platform.mil_program, ane_mil_target(), + in_ch, spatial, out_ch, in_ch, + in_ch, spatial, out_ch, in_ch, + out_ch, spatial, out_ch, spatial]; +} + +// Keep the baked-weight version for reference (used in inference-only scenarios) +static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor y16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " } -> (y);\n" + "}\n", + g_ane_platform.mil_program, ane_mil_target(), + in_ch, spatial, in_ch, spatial, + out_ch, in_ch, out_ch, in_ch, + out_ch, 
spatial, out_ch, spatial]; +} + +// Generate MIL for fused QKV: 3 parallel convs from same input +// Input: [1, dim, 1, S] +// Outputs: Q[1, dim, 1, S], K[1, dim, 1, S], V[1, dim, 1, S] +// Weight blob layout: Wq[dim,dim] @ offset 64, Wk @ offset 64+cs, Wv @ offset 64+2*cs +// where cs = 64 + dim*dim*2 +static NSString *mil_gen_qkv(int dim, int spatial) { + NSUInteger cs = 64 + (NSUInteger)dim * dim * 2; + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor Wq = const()[name = string(\"Wq\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor Wk = const()[name = string(\"Wk\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor Wv = const()[name = string(\"Wv\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n" + " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = 
c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n" + " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n" + " tensor k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n" + " tensor v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + g_ane_platform.mil_program, ane_mil_target(), + dim, spatial, dim, spatial, + dim, dim, dim, dim, + dim, dim, dim, dim, (unsigned long)(64 + cs), + dim, dim, dim, dim, (unsigned long)(64 + 2*cs), + dim, spatial, dim, spatial, dim, spatial, + dim, spatial, dim, spatial, dim, spatial]; +} + +// Build weight blob for fused QKV (3 weight matrices concatenated) +static NSData *mil_build_qkv_weight_blob(const float *wq, const float *wk, const float *wv, int dim) { + NSUInteger wsize = (NSUInteger)dim * dim * 2; + NSUInteger cs = 64 + wsize; + NSUInteger total = 64 + 3 * cs; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + const float *ws[3] = {wq, wk, wv}; + for (int w = 0; w < 3; w++) { + uint8_t *chunk = buf + 64 + w * cs; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; + chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)dim * dim; i++) + fp16[i] = (_Float16)ws[w][i]; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Build weight blob for fused FFN up (w1 + w3, both [hidden_dim, dim]) +static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, int hidden_dim, int dim) { + 
NSUInteger wsize = (NSUInteger)hidden_dim * dim * 2; + NSUInteger cs = 64 + wsize; + NSUInteger total = 64 + 2 * cs; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + const float *ws[2] = {w1, w3}; + for (int w = 0; w < 2; w++) { + uint8_t *chunk = buf + 64 + w * cs; + chunk[0]=0xEF; chunk[1]=0xBE; chunk[2]=0xAD; chunk[3]=0xDE; + chunk[4]=0x01; + *(uint32_t*)(chunk + 8) = (uint32_t)wsize; + *(uint32_t*)(chunk + 16) = (uint32_t)(64 + w * cs + 64); // absolute data offset + _Float16 *fp16 = (_Float16*)(chunk + 64); + for (NSUInteger i = 0; i < (NSUInteger)hidden_dim * dim; i++) + fp16[i] = (_Float16)ws[w][i]; + } + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Generate MIL for fused FFN up: w1 + w3 parallel convs +static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) { + NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2; + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" + " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" + " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" + " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" + " tensor W1 = const()[name = string(\"W1\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " tensor W3 = const()[name = string(\"W3\"), " + "val = 
tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " tensor h1 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n" + " tensor h3 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n" + " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" + " tensor out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n" + " tensor out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n" + " } -> (out1, out3);\n" + "}\n", + g_ane_platform.mil_program, ane_mil_target(), + dim, spatial, dim, spatial, + hidden_dim, dim, hidden_dim, dim, + hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs), + hidden_dim, spatial, hidden_dim, spatial, + hidden_dim, spatial, hidden_dim, spatial]; +} diff --git a/training/stories_config.h b/training/stories_config.h index f967974..d55e115 100644 --- a/training/stories_config.h +++ b/training/stories_config.h @@ -1,189 +1,190 @@ -// stories_config.h — Stories110M model config and structures -#pragma once -#import -#import -#import -#import -#import -#import -#import -#include -#include -#include -#include -#include -#include - -// Stories110M config -#define DIM 768 -#define HIDDEN 2048 -#define HEADS 12 -#define HD (DIM/HEADS) -#define SEQ 256 -#define NLAYERS 12 -#define VOCAB 32000 -#define ACCUM_STEPS 10 -#define MAX_COMPILES 100 - -// Per compile: 5 weight-bearing kernels per layer + 1 classifier = 5*12+1 = 61 -// Plus 1 static (sdpaBwd2 per layer, no weights) = 12 more but those are weight-free -// Actually sdpaBwd2 has no weights, compile once per layer -// Weight-bearing: fwdAttn(1) + fwdFFN(1) + ffnBwd(1) + sdpaBwd1(1) + qkvBwd(1) = 5 per layer -// 5 * 12 = 60 weight-bearing compiles per batch -// With 
MAX_COMPILES=100, we get 1 batch of ACCUM_STEPS before restart -#define KERNELS_PER_LAYER 5 -#define TOTAL_WEIGHT_KERNELS (KERNELS_PER_LAYER * NLAYERS) - -// Attention score channels for SDPA backward -#define SCORE_CH (HEADS*SEQ) - -// Weight sizes per layer -#define WQ_SZ (DIM*DIM) -#define WO_SZ (DIM*DIM) -#define W1_SZ (HIDDEN*DIM) -#define W2_SZ (DIM*HIDDEN) -#define W3_SZ (HIDDEN*DIM) -#define LAYER_PARAMS (4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM) -#define TOTAL_PARAMS (NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM) // +rms_final+embed - -// Per-layer weight and optimizer state -typedef struct { - float *Wq, *Wk, *Wv, *Wo; - float *W1, *W2, *W3; - float *rms_att, *rms_ffn; -} LayerWeights; - -typedef struct { - float *m, *v; - size_t n; -} AdamState; - -typedef struct { - AdamState Wq, Wk, Wv, Wo; - AdamState W1, W2, W3; - AdamState rms_att, rms_ffn; -} LayerAdam; - -// Per-layer activation buffers (saved for backward) -typedef struct { - float *layer_in; // [DIM, SEQ] input to this layer (for rmsnorm1 bwd) - float *xnorm; // [DIM, SEQ] rmsnorm1 output - float *Q, *K, *V; // [DIM, SEQ] QKV projections - float *attn_out; // [DIM, SEQ] attention output (before Wo) - float *o_out; // [DIM, SEQ] Wo output - float *x2; // [DIM, SEQ] residual after attn - float *x2norm; // [DIM, SEQ] rmsnorm2 output - float *h1, *h3; // [HIDDEN, SEQ] FFN intermediates - float *silu_out; // [HIDDEN, SEQ] SiLU(h1)*h3 - float *ffn_out; // [DIM, SEQ] FFN output -} LayerActs; - -// Per-layer gradient accumulators -typedef struct { - float *Wq, *Wk, *Wv, *Wo; - float *W1, *W2, *W3; - float *rms_att, *rms_ffn; -} LayerGrads; - -// ANE kernels per layer -typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern; -typedef struct { - Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd; -} LayerKernels; - -// Checkpoint header -typedef struct { - int magic; // 0x424C5A54 "BLZT" - int version; // 2 - int step, total_steps; - int n_layers, vocab_size, 
dim, hidden_dim, n_heads, seq_len; - float lr, loss; - double cum_compile, cum_train, cum_wall; - int cum_steps, cum_batches; - int adam_t; - int pad[3]; // alignment -} CkptHdr; - -// llama2.c model file header -typedef struct { - int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len; -} Llama2Config; - -// Globals -static Class g_D, g_I, g_AR, g_AIO; -static mach_timebase_info_data_t g_tb; -static int g_compile_count = 0; - -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } - -// Alloc helpers -static AdamState adam_alloc(size_t n) { AdamState s; s.m=(float*)calloc(n,4); s.v=(float*)calloc(n,4); s.n=n; return s; } -static void adam_free(AdamState *s) { free(s->m); free(s->v); } - -static LayerWeights layer_weights_alloc(void) { - LayerWeights w; - w.Wq=(float*)malloc(WQ_SZ*4); w.Wk=(float*)malloc(WQ_SZ*4); - w.Wv=(float*)malloc(WQ_SZ*4); w.Wo=(float*)malloc(WO_SZ*4); - w.W1=(float*)malloc(W1_SZ*4); w.W2=(float*)malloc(W2_SZ*4); w.W3=(float*)malloc(W3_SZ*4); - w.rms_att=(float*)malloc(DIM*4); w.rms_ffn=(float*)malloc(DIM*4); - return w; -} -static void layer_weights_free(LayerWeights *w) { - free(w->Wq);free(w->Wk);free(w->Wv);free(w->Wo); - free(w->W1);free(w->W2);free(w->W3); - free(w->rms_att);free(w->rms_ffn); -} -static LayerAdam layer_adam_alloc(void) { - LayerAdam a; - a.Wq=adam_alloc(WQ_SZ); a.Wk=adam_alloc(WQ_SZ); a.Wv=adam_alloc(WQ_SZ); a.Wo=adam_alloc(WO_SZ); - a.W1=adam_alloc(W1_SZ); a.W2=adam_alloc(W2_SZ); a.W3=adam_alloc(W3_SZ); - a.rms_att=adam_alloc(DIM); a.rms_ffn=adam_alloc(DIM); - return a; -} -static void layer_adam_free(LayerAdam *a) { - 
adam_free(&a->Wq);adam_free(&a->Wk);adam_free(&a->Wv);adam_free(&a->Wo); - adam_free(&a->W1);adam_free(&a->W2);adam_free(&a->W3); - adam_free(&a->rms_att);adam_free(&a->rms_ffn); -} -static LayerActs layer_acts_alloc(void) { - LayerActs a; - a.layer_in=(float*)malloc(SEQ*DIM*4); - a.xnorm=(float*)malloc(SEQ*DIM*4); a.Q=(float*)malloc(SEQ*DIM*4); - a.K=(float*)malloc(SEQ*DIM*4); a.V=(float*)malloc(SEQ*DIM*4); - a.attn_out=(float*)malloc(SEQ*DIM*4); a.o_out=(float*)malloc(SEQ*DIM*4); - a.x2=(float*)malloc(SEQ*DIM*4); a.x2norm=(float*)malloc(SEQ*DIM*4); - a.h1=(float*)malloc(SEQ*HIDDEN*4); a.h3=(float*)malloc(SEQ*HIDDEN*4); - a.silu_out=(float*)malloc(SEQ*HIDDEN*4); a.ffn_out=(float*)malloc(SEQ*DIM*4); - return a; -} -static void layer_acts_free(LayerActs *a) { - free(a->layer_in);free(a->xnorm);free(a->Q);free(a->K);free(a->V); - free(a->attn_out);free(a->o_out);free(a->x2);free(a->x2norm); - free(a->h1);free(a->h3);free(a->silu_out);free(a->ffn_out); -} -static LayerGrads layer_grads_alloc(void) { - LayerGrads g; - g.Wq=(float*)calloc(WQ_SZ,4); g.Wk=(float*)calloc(WQ_SZ,4); - g.Wv=(float*)calloc(WQ_SZ,4); g.Wo=(float*)calloc(WO_SZ,4); - g.W1=(float*)calloc(W1_SZ,4); g.W2=(float*)calloc(W2_SZ,4); g.W3=(float*)calloc(W3_SZ,4); - g.rms_att=(float*)calloc(DIM,4); g.rms_ffn=(float*)calloc(DIM,4); - return g; -} -static void layer_grads_zero(LayerGrads *g) { - memset(g->Wq,0,WQ_SZ*4);memset(g->Wk,0,WQ_SZ*4); - memset(g->Wv,0,WQ_SZ*4);memset(g->Wo,0,WO_SZ*4); - memset(g->W1,0,W1_SZ*4);memset(g->W2,0,W2_SZ*4);memset(g->W3,0,W3_SZ*4); - memset(g->rms_att,0,DIM*4);memset(g->rms_ffn,0,DIM*4); -} -static void layer_grads_free(LayerGrads *g) { - free(g->Wq);free(g->Wk);free(g->Wv);free(g->Wo); - free(g->W1);free(g->W2);free(g->W3); - free(g->rms_att);free(g->rms_ffn); -} +// stories_config.h — Stories110M model config and structures +#pragma once +#import +#import +#import +#import +#import +#import +#import +#include +#include +#include +#include +#include +#include +#include 
"ane_compat.h" + +// Stories110M config +#define DIM 768 +#define HIDDEN 2048 +#define HEADS 12 +#define HD (DIM/HEADS) +#define SEQ 256 +#define NLAYERS 12 +#define VOCAB 32000 +#define ACCUM_STEPS 10 +#define MAX_COMPILES 100 + +// Per compile: 5 weight-bearing kernels per layer + 1 classifier = 5*12+1 = 61 +// Plus 1 static (sdpaBwd2 per layer, no weights) = 12 more but those are weight-free +// Actually sdpaBwd2 has no weights, compile once per layer +// Weight-bearing: fwdAttn(1) + fwdFFN(1) + ffnBwd(1) + sdpaBwd1(1) + qkvBwd(1) = 5 per layer +// 5 * 12 = 60 weight-bearing compiles per batch +// With MAX_COMPILES=100, we get 1 batch of ACCUM_STEPS before restart +#define KERNELS_PER_LAYER 5 +#define TOTAL_WEIGHT_KERNELS (KERNELS_PER_LAYER * NLAYERS) + +// Attention score channels for SDPA backward +#define SCORE_CH (HEADS*SEQ) + +// Weight sizes per layer +#define WQ_SZ (DIM*DIM) +#define WO_SZ (DIM*DIM) +#define W1_SZ (HIDDEN*DIM) +#define W2_SZ (DIM*HIDDEN) +#define W3_SZ (HIDDEN*DIM) +#define LAYER_PARAMS (4*WQ_SZ + W1_SZ + W2_SZ + W3_SZ + 2*DIM) +#define TOTAL_PARAMS (NLAYERS * LAYER_PARAMS + DIM + VOCAB*DIM) // +rms_final+embed + +// Per-layer weight and optimizer state +typedef struct { + float *Wq, *Wk, *Wv, *Wo; + float *W1, *W2, *W3; + float *rms_att, *rms_ffn; +} LayerWeights; + +typedef struct { + float *m, *v; + size_t n; +} AdamState; + +typedef struct { + AdamState Wq, Wk, Wv, Wo; + AdamState W1, W2, W3; + AdamState rms_att, rms_ffn; +} LayerAdam; + +// Per-layer activation buffers (saved for backward) +typedef struct { + float *layer_in; // [DIM, SEQ] input to this layer (for rmsnorm1 bwd) + float *xnorm; // [DIM, SEQ] rmsnorm1 output + float *Q, *K, *V; // [DIM, SEQ] QKV projections + float *attn_out; // [DIM, SEQ] attention output (before Wo) + float *o_out; // [DIM, SEQ] Wo output + float *x2; // [DIM, SEQ] residual after attn + float *x2norm; // [DIM, SEQ] rmsnorm2 output + float *h1, *h3; // [HIDDEN, SEQ] FFN intermediates + float 
*silu_out; // [HIDDEN, SEQ] SiLU(h1)*h3 + float *ffn_out; // [DIM, SEQ] FFN output +} LayerActs; + +// Per-layer gradient accumulators +typedef struct { + float *Wq, *Wk, *Wv, *Wo; + float *W1, *W2, *W3; + float *rms_att, *rms_ffn; +} LayerGrads; + +// ANE kernels per layer +typedef struct { void *model; IOSurfaceRef ioIn, ioOut; void *request; void *tmpDir; } Kern; +typedef struct { + Kern *fwdAttn, *fwdFFN, *ffnBwd, *sdpaBwd1, *sdpaBwd2, *qkvBwd; +} LayerKernels; + +// Checkpoint header +typedef struct { + int magic; // 0x424C5A54 "BLZT" + int version; // 2 + int step, total_steps; + int n_layers, vocab_size, dim, hidden_dim, n_heads, seq_len; + float lr, loss; + double cum_compile, cum_train, cum_wall; + int cum_steps, cum_batches; + int adam_t; + int pad[3]; // alignment +} CkptHdr; + +// llama2.c model file header +typedef struct { + int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len; +} Llama2Config; + +// Globals +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static int g_compile_count = 0; + +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +// Alloc helpers +static AdamState adam_alloc(size_t n) { AdamState s; s.m=(float*)calloc(n,4); s.v=(float*)calloc(n,4); s.n=n; return s; } +static void adam_free(AdamState *s) { free(s->m); free(s->v); } + +static LayerWeights layer_weights_alloc(void) { + LayerWeights w; + w.Wq=(float*)malloc(WQ_SZ*4); w.Wk=(float*)malloc(WQ_SZ*4); + w.Wv=(float*)malloc(WQ_SZ*4); w.Wo=(float*)malloc(WO_SZ*4); + w.W1=(float*)malloc(W1_SZ*4); w.W2=(float*)malloc(W2_SZ*4); w.W3=(float*)malloc(W3_SZ*4); + 
w.rms_att=(float*)malloc(DIM*4); w.rms_ffn=(float*)malloc(DIM*4); + return w; +} +static void layer_weights_free(LayerWeights *w) { + free(w->Wq);free(w->Wk);free(w->Wv);free(w->Wo); + free(w->W1);free(w->W2);free(w->W3); + free(w->rms_att);free(w->rms_ffn); +} +static LayerAdam layer_adam_alloc(void) { + LayerAdam a; + a.Wq=adam_alloc(WQ_SZ); a.Wk=adam_alloc(WQ_SZ); a.Wv=adam_alloc(WQ_SZ); a.Wo=adam_alloc(WO_SZ); + a.W1=adam_alloc(W1_SZ); a.W2=adam_alloc(W2_SZ); a.W3=adam_alloc(W3_SZ); + a.rms_att=adam_alloc(DIM); a.rms_ffn=adam_alloc(DIM); + return a; +} +static void layer_adam_free(LayerAdam *a) { + adam_free(&a->Wq);adam_free(&a->Wk);adam_free(&a->Wv);adam_free(&a->Wo); + adam_free(&a->W1);adam_free(&a->W2);adam_free(&a->W3); + adam_free(&a->rms_att);adam_free(&a->rms_ffn); +} +static LayerActs layer_acts_alloc(void) { + LayerActs a; + a.layer_in=(float*)malloc(SEQ*DIM*4); + a.xnorm=(float*)malloc(SEQ*DIM*4); a.Q=(float*)malloc(SEQ*DIM*4); + a.K=(float*)malloc(SEQ*DIM*4); a.V=(float*)malloc(SEQ*DIM*4); + a.attn_out=(float*)malloc(SEQ*DIM*4); a.o_out=(float*)malloc(SEQ*DIM*4); + a.x2=(float*)malloc(SEQ*DIM*4); a.x2norm=(float*)malloc(SEQ*DIM*4); + a.h1=(float*)malloc(SEQ*HIDDEN*4); a.h3=(float*)malloc(SEQ*HIDDEN*4); + a.silu_out=(float*)malloc(SEQ*HIDDEN*4); a.ffn_out=(float*)malloc(SEQ*DIM*4); + return a; +} +static void layer_acts_free(LayerActs *a) { + free(a->layer_in);free(a->xnorm);free(a->Q);free(a->K);free(a->V); + free(a->attn_out);free(a->o_out);free(a->x2);free(a->x2norm); + free(a->h1);free(a->h3);free(a->silu_out);free(a->ffn_out); +} +static LayerGrads layer_grads_alloc(void) { + LayerGrads g; + g.Wq=(float*)calloc(WQ_SZ,4); g.Wk=(float*)calloc(WQ_SZ,4); + g.Wv=(float*)calloc(WQ_SZ,4); g.Wo=(float*)calloc(WO_SZ,4); + g.W1=(float*)calloc(W1_SZ,4); g.W2=(float*)calloc(W2_SZ,4); g.W3=(float*)calloc(W3_SZ,4); + g.rms_att=(float*)calloc(DIM,4); g.rms_ffn=(float*)calloc(DIM,4); + return g; +} +static void layer_grads_zero(LayerGrads *g) { + 
memset(g->Wq,0,WQ_SZ*4);memset(g->Wk,0,WQ_SZ*4); + memset(g->Wv,0,WQ_SZ*4);memset(g->Wo,0,WO_SZ*4); + memset(g->W1,0,W1_SZ*4);memset(g->W2,0,W2_SZ*4);memset(g->W3,0,W3_SZ*4); + memset(g->rms_att,0,DIM*4);memset(g->rms_ffn,0,DIM*4); +} +static void layer_grads_free(LayerGrads *g) { + free(g->Wq);free(g->Wk);free(g->Wv);free(g->Wo); + free(g->W1);free(g->W2);free(g->W3); + free(g->rms_att);free(g->rms_ffn); +} diff --git a/training/stories_mil.h b/training/stories_mil.h index dccca44..1ca063a 100644 --- a/training/stories_mil.h +++ b/training/stories_mil.h @@ -1,286 +1,286 @@ -// stories_mil.h — MIL program generators for ANE kernels -// Same architecture as single-layer train_large.m but parameterized -#pragma once -#include "stories_io.h" - -#define MIL_HDR \ - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ - "{\"coremltools-version\", \"9.0\"}})]\n{\n" -#define CONV_CONST \ - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \ - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" \ - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" \ - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" \ - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - -// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) -static NSString *gen_sdpa_fwd_taps(void) { - float sc = 1.0f/sqrtf((float)HD); - float invd = 1.0f/(float)DIM; - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor ss = 
reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; - [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor Wq = const()[name=string(\"Wq\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wk = const()[name=string(\"Wk\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wv = const()[name=string(\"Wv\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wo = const()[name=string(\"Wo\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor vf = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor qsh = const()[name=string(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"]; - [m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor at = 
transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor os = const()[name=string(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm) -static NSString *gen_ffn_fwd_taps(void) { - float invd = 1.0f/(float)DIM; - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = 
mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; - [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor W1 = const()[name=string(\"W1\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" tensor W3 = const()[name=string(\"W3\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" tensor W2 = const()[name=string(\"W2\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; - [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3) 
-static NSString *gen_ffn_bwd(void) { - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; - [m appendString:@CONV_CONST]; - [m appendString:@" tensor bd = const()[name=string(\"bd\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor sd = const()[name=string(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor s1 = const()[name=string(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; - [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W2t = const()[name=string(\"W2t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; - [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ]; - [m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"]; - [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", 
HIDDEN, SEQ]; - [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W1t = const()[name=string(\"W1t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor W3t = const()[name=string(\"W3t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// QKV backward: concat(dq,dk,dv) → dx -static NSString *gen_qkvb(void) { - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 3*DIM, SEQ]; - [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = 
const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wqt = const()[name=string(\"Wqt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wkt = const()[name=string(\"Wkt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wvt = const()[name=string(\"Wvt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// SDPA backward part 1 + Wo^T -static NSString *gen_sdpa_bwd1(void) { - float sc = 1.0f/sqrtf((float)HD); - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; - [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = 
const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; - [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wot = const()[name=string(\"Wot\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ]; - [m 
appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dvs = const()[name=string(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor scs = const()[name=string(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = 
const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// SDPA backward part 2: concat(probs,dp,Q,K) → concat(dQ,dK) -static NSString *gen_sdpa_bwd2(void) { - float sc = 1.0f/sqrtf((float)HD); - int bwd2_in = 2*SCORE_CH + 2*DIM; - NSMutableString *m = [NSMutableString string]; - [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; - [m appendFormat:@" tensor sz_sc = const()[name=string(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; - [m appendFormat:@" tensor dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor sz_d = const()[name=string(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; - [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor ssh = const()[name=string(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor rsh = 
const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" tensor rax = const()[name=string(\"rax\"), val=tensor([-1])];\n"]; - [m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ]; - [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor fs = const()[name=string(\"fs\"), 
val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ]; - [m appendString:@" } -> (out);\n}\n"]; - return m; -} - -// Mask blob (causal mask [SEQ,SEQ]) -static NSData *g_mask_blob = nil; -static NSData *get_mask_blob(void) { - if (!g_mask_blob) { - _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16)); - for(int t=0;t st = const()[name=string(\"st\"), val=tensor([1,1])];\n" \ + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" \ + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" \ + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + +// SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) +static NSString *gen_sdpa_fwd_taps(void) { + float sc = 1.0f/sqrtf((float)HD); + float invd = 1.0f/(float)DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; + [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; + [m appendFormat:@" fp16 eps = 
const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; + [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor Wq = const()[name=string(\"Wq\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wk = const()[name=string(\"Wk\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wv = const()[name=string(\"Wv\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wo = const()[name=string(\"Wo\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor qsh = const()[name=string(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + 
[m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"]; + [m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; + [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor os = const()[name=string(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor oo = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// FFN forward + taps: x2 → rmsnorm → FFN → concat(ffn_out, h1, h3, silu_out, x2norm) +static NSString *gen_ffn_fwd_taps(void) { + float invd = 1.0f/(float)DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; + [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; + [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; + [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, 
SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor W1 = const()[name=string(\"W1\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W3 = const()[name=string(\"W3\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W2 = const()[name=string(\"W2\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; + [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// FFN backward: concat(dffn,h1,h3) → concat(dx,dh1,dh3) +static NSString *gen_ffn_bwd(void) { + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), DIM+2*HIDDEN, SEQ]; + [m appendString:@CONV_CONST]; + [m appendString:@" tensor bd = const()[name=string(\"bd\"), 
val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor sd = const()[name=string(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor s1 = const()[name=string(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; + [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W2t = const()[name=string(\"W2t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; + [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ]; + [m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"]; + [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", 
HIDDEN, SEQ]; + [m appendFormat:@" tensor W1t = const()[name=string(\"W1t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor W3t = const()[name=string(\"W3t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// QKV backward: concat(dq,dk,dv) → dx +static NSString *gen_qkvb(void) { + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), 3*DIM, SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m 
appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wqt = const()[name=string(\"Wqt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wkt = const()[name=string(\"Wkt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wvt = const()[name=string(\"Wvt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// SDPA backward part 1 + Wo^T +static NSString *gen_sdpa_bwd1(void) { + float sc = 1.0f/sqrtf((float)HD); + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), 4*DIM, SEQ]; + [m appendString:@CONV_CONST]; + [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m 
appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; + [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wot = const()[name=string(\"Wot\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD]; + [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; + [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; + [m appendFormat:@" 
tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; + [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dvs = const()[name=string(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor scs = const()[name=string(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// SDPA backward part 2: concat(probs,dp,Q,K) 
→ concat(dQ,dK) +static NSString *gen_sdpa_bwd2(void) { + float sc = 1.0f/sqrtf((float)HD); + int bwd2_in = 2*SCORE_CH + 2*DIM; + NSMutableString *m = [NSMutableString string]; + [m appendString:MIL_HDR]; + [m appendFormat:@" func main<%s>(tensor x) {\n", ane_mil_target(), bwd2_in, SEQ]; + [m appendFormat:@" tensor sz_sc = const()[name=string(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; + [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; + [m appendFormat:@" tensor dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor sz_d = const()[name=string(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor ssh = const()[name=string(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = 
transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor rax = const()[name=string(\"rax\"), val=tensor([-1])];\n"]; + [m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ]; + [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; + [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; + [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; + [m appendFormat:@" tensor dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor fs = const()[name=string(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ]; + [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; + [m 
appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ]; + [m appendString:@" } -> (out);\n}\n"]; + return m; +} + +// Mask blob (causal mask [SEQ,SEQ]) +static NSData *g_mask_blob = nil; +static NSData *get_mask_blob(void) { + if (!g_mask_blob) { + _Float16 *mask = (_Float16*)calloc(SEQ*SEQ, sizeof(_Float16)); + for(int t=0;t -#import -#import -#import -#import -#import -#include - -static mach_timebase_info_data_t g_tb; -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } - -static void dump_class(const char *name) { - Class cls = NSClassFromString([NSString stringWithUTF8String:name]); - if (!cls) { printf(" %s: NOT FOUND\n", name); return; } - printf("\n=== %s ===\n", name); - unsigned int count; - Method *methods = class_copyMethodList(object_getClass(cls), &count); - if (count) printf(" Class methods:\n"); - for (unsigned int i = 0; i < count; i++) { - SEL s = method_getName(methods[i]); - const char *enc = method_getTypeEncoding(methods[i]); - printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); - } - free(methods); - methods = class_copyMethodList(cls, &count); - if (count) printf(" Instance methods:\n"); - for (unsigned int i = 0; i < count; i++) { - SEL s = method_getName(methods[i]); - const char *enc = method_getTypeEncoding(methods[i]); - printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); - } - free(methods); - unsigned int pcount; - objc_property_t *props = class_copyPropertyList(cls, &pcount); - if (pcount) printf(" Properties:\n"); - for (unsigned int i = 0; i < pcount; i++) { - const char *pname = property_getName(props[i]); - const char *pattr = property_getAttributes(props[i]); - printf(" @property %s [%s]\n", pname, pattr ? 
pattr : "?"); - } - free(props); -} - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - mach_timebase_info(&g_tb); - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - - printf("=== ANE Advanced Interface Probe ===\n"); - - // === Part 1: Event/Sync classes === - printf("\n--- Part 1: Event/Sync Classes ---\n"); - dump_class("_ANESharedEvents"); - dump_class("_ANESharedSignalEvent"); - dump_class("_ANESharedWaitEvent"); - dump_class("_ANEEvent"); - dump_class("_ANEFenceEvent"); - - const char *event_classes[] = { - "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent", - "_ANEEvent", "_ANEFenceEvent", NULL - }; - for (int i = 0; event_classes[i]; i++) { - Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]); - if (!cls) continue; - @try { - id obj = [[cls alloc] init]; - printf(" %s alloc/init: %s\n", event_classes[i], - obj ? 
[[obj description] UTF8String] : "nil"); - } @catch (NSException *ex) { - printf(" %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]); - } - } - - // === Part 2: VirtualClient and ChainingRequest === - printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n"); - dump_class("_ANEVirtualClient"); - dump_class("_ANEChainingRequest"); - dump_class("_ANEMultiRequest"); - dump_class("_ANEBatchRequest"); - - // === Part 3: Compile working kernel for weightsBuffer + procedureIndex tests === - printf("\n--- Part 3: weightsBuffer IOSurface test ---\n"); - Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - Class g_I = NSClassFromString(@"_ANEInMemoryModel"); - Class g_AR = NSClassFromString(@"_ANERequest"); - Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - - int CH = 64, SP = 32; - _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); - for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; - int ws = CH*CH*2, tot = 128+ws; - uint8_t *blob = (uint8_t*)calloc(tot,1); - blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; - *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - memcpy(blob+128, w, ws); - NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), 
val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; - - NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), - md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] - withIntermediateDirectories:YES attributes:nil error:nil]; - [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - - int ioBytes = CH * SP * 4; - IOSurfaceRef ioIn = make_surface(ioBytes); - IOSurfaceRef ioOut = make_surface(ioBytes); - - IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = 
(float)(s+1) * 0.1f; - IOSurfaceUnlock(ioIn, 0, NULL); - - // Baseline eval - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req0, &e); - printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL"); - - IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); - float baseline_0 = out0[0], baseline_1 = out0[1]; - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); - IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); - - // Test weightsBuffer: IOSurface with 3x identity weights - printf("\n Testing weightsBuffer IOSurface...\n"); - _Float16 *w3 = (_Float16*)calloc(CH*CH, sizeof(_Float16)); - for (int i = 0; i < CH; i++) w3[i*CH+i] = (_Float16)3.0f; - IOSurfaceRef ioW = make_surface(ws); - IOSurfaceLock(ioW, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(ioW), w3, ws); - IOSurfaceUnlock(ioW, 0, NULL); - free(w3); - id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW); - - wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], wW, nil, @0); - printf(" 
Request with weightsBuffer: %s\n", req_wb ? "created" : "nil"); - - if (req_wb) { - ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e); - printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL"); - if (ok) { - IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); - bool changed = fabsf(outW[0] - baseline_0) > 0.001f; - bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f; - printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged"); - if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected"); - printf("\n"); - IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); - } - } - CFRelease(ioW); - - // === Part 4: procedureIndex sweep === - printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n"); - for (int pi = 0; pi < 16; pi++) { - wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - id req_p = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @(pi)); - if (!req_p) { printf(" procIdx %2d: request=nil\n", pi); continue; } - ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_p, &e); - printf(" procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL", - !ok && e ? 
[NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : ""); - } - - // === Part 5: Scan all ANE classes === - printf("\n--- Part 5: All ANE-prefixed classes ---\n"); - unsigned int classCount; - Class *allClasses = objc_copyClassList(&classCount); - for (unsigned int i = 0; i < classCount; i++) { - const char *name = class_getName(allClasses[i]); - if (strstr(name, "ANE") || strstr(name, "ane")) { - printf(" %s\n", name); - } - } - free(allClasses); - free(w); - - // Cleanup - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - [fm removeItemAtPath:td error:nil]; - CFRelease(ioIn); CFRelease(ioOut); - - printf("\nDone.\n"); - } - return 0; -} +// test_ane_advanced.m — Probe advanced ANE interfaces +// SharedEvents, weightsBuffer, procedureIndex, VirtualClient, ChainingRequest +#import +#import +#import +#import +#import +#import +#include +#include "ane_compat.h" + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? 
enc : "?"); + } + free(methods); + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties:\n"); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); + } + free(props); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + ane_detect_platform(); + ane_print_platform(); + + printf("=== ANE Advanced Interface Probe ===\n"); + + // === Part 1: Event/Sync classes === + printf("\n--- Part 1: Event/Sync Classes ---\n"); + dump_class("_ANESharedEvents"); + dump_class("_ANESharedSignalEvent"); + dump_class("_ANESharedWaitEvent"); + dump_class("_ANEEvent"); + dump_class("_ANEFenceEvent"); + + const char *event_classes[] = { + "_ANESharedEvents", "_ANESharedSignalEvent", "_ANESharedWaitEvent", + "_ANEEvent", "_ANEFenceEvent", NULL + }; + for (int i = 0; event_classes[i]; i++) { + Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[i]]); + if (!cls) continue; + @try { + id obj = [[cls alloc] init]; + printf(" %s alloc/init: %s\n", event_classes[i], + obj ? 
[[obj description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s alloc/init: EXCEPTION: %s\n", event_classes[i], [[ex reason] UTF8String]); + } + } + + // === Part 2: VirtualClient and ChainingRequest === + printf("\n--- Part 2: VirtualClient / ChainingRequest ---\n"); + dump_class("_ANEVirtualClient"); + dump_class("_ANEChainingRequest"); + dump_class("_ANEMultiRequest"); + dump_class("_ANEBatchRequest"); + + // === Part 3: Compile working kernel for weightsBuffer + procedureIndex tests === + printf("\n--- Part 3: weightsBuffer IOSurface test ---\n"); + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + int CH = 64, SP = 32; + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot,1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " 
tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" + " } -> (y);\n" + "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + int ioBytes = CH * SP * 4; + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); + + IOSurfaceLock(ioIn, 0, NULL); + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) 
inp[c*SP+s] = (float)(s+1) * 0.1f; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Baseline eval + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req0 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req0, &e); + printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL"); + + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); + float baseline_0 = out0[0], baseline_1 = out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + + // Test weightsBuffer: IOSurface with 3x identity weights + printf("\n Testing weightsBuffer IOSurface...\n"); + _Float16 *w3 = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w3[i*CH+i] = (_Float16)3.0f; + IOSurfaceRef ioW = make_surface(ws); + IOSurfaceLock(ioW, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(ioW), w3, ws); + IOSurfaceUnlock(ioW, 0, NULL); + free(w3); + id wW = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioW); + + wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req_wb = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], wW, nil, @0); + 
printf(" Request with weightsBuffer: %s\n", req_wb ? "created" : "nil"); + + if (req_wb) { + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_wb, &e); + printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL"); + if (ok) { + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); + bool changed = fabsf(outW[0] - baseline_0) > 0.001f; + bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f; + printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged"); + if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected"); + printf("\n"); + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + } + } + CFRelease(ioW); + + // === Part 4: procedureIndex sweep === + printf("\n--- Part 4: procedureIndex sweep (0-15) ---\n"); + for (int pi = 0; pi < 16; pi++) { + wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req_p = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @(pi)); + if (!req_p) { printf(" procIdx %2d: request=nil\n", pi); continue; } + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req_p, &e); + printf(" procIdx %2d: %s%s\n", pi, ok ? "OK" : "FAIL", + !ok && e ? 
[NSString stringWithFormat:@" (%@)", [e localizedDescription]].UTF8String : ""); + } + + // === Part 5: Scan all ANE classes === + printf("\n--- Part 5: All ANE-prefixed classes ---\n"); + unsigned int classCount; + Class *allClasses = objc_copyClassList(&classCount); + for (unsigned int i = 0; i < classCount; i++) { + const char *name = class_getName(allClasses[i]); + if (strstr(name, "ANE") || strstr(name, "ane")) { + printf(" %s\n", name); + } + } + free(allClasses); + free(w); + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [fm removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m index cb9b761..55c381b 100644 --- a/training/test_ane_causal_attn.m +++ b/training/test_ane_causal_attn.m @@ -1,295 +1,301 @@ -// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE -// This gives us causal masking with ANE acceleration for the matmuls -#import -#import -#import -#import -#import -#include - -#define HEADS 12 -#define HD 64 -#define SEQ 64 - -static Class g_D, g_I, g_AR, g_AIO; -static mach_timebase_info_data_t g_tb; -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} 
- -typedef struct { id model; NSString *td; } Kern; - -static Kern compile_mil(NSString *mil) { - Kern k = {nil, nil}; - NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil); - if (!desc) { printf("desc=NULL\n"); return k; } - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil]; - [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { - printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); - [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; - } - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - k.model = mdl; k.td = td; - return k; -} - -static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) { - NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; - for (int i = 0; i < nin; i++) { - [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; - [inIdx addObject:@(i)]; - } - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - inArr, inIdx, @[wO], @[@0], nil, nil, @0); - NSError *e = nil; - return ((BOOL(*)(id,SEL,unsigned 
int,id,id,NSError**))objc_msgSend)( - k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); -} - -static void cleanup_kern(Kern *k) { - if (!k->model) return; - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); - [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - ane_init(); - mach_timebase_info(&g_tb); - - // === Approach 1: Non-causal SDPA (baseline) === - printf("=== Non-causal SDPA (baseline) ===\n"); - NSString *sdpa_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k, tensor v) {\n" - " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" - " } -> (att);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; - Kern kSDPA = compile_mil(sdpa_mil); - printf("SDPA compile: %s\n", kSDPA.model ? 
"OK" : "FAIL"); - - // === Approach 2: Decomposed causal via matmul ops === - // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ] - // MIL matmul: matmul(x=Q, y=K, transpose_y=true) - // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD] - // scores = Q @ K^T → [1, HEADS, SEQ, SEQ] - printf("\n=== Decomposed causal attention ===\n"); - NSString *qkt_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k) {\n" - " tensor scores = matmul(" - "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n" - " } -> (scores);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ]; - Kern kQKT = compile_mil(qkt_mil); - printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL"); - - // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD] - NSString *sv_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor s, " - "tensor v) {\n" - " tensor out = matmul(" - "x = s, y = v)[name = string(\"sv\")];\n" - " } -> (out);\n}\n", - HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD]; - Kern kSV = compile_mil(sv_mil); - printf("scores@V compile: %s\n", kSV.model ? 
"OK" : "FAIL"); - - if (!kSDPA.model || !kQKT.model || !kSV.model) { - printf("Some kernels failed to compile, aborting\n"); - goto done; - } - - // Generate test data - srand48(42); - int total_qkv = HEADS * SEQ * HD; - _Float16 *Q = (_Float16*)malloc(total_qkv * 2); - _Float16 *K = (_Float16*)malloc(total_qkv * 2); - _Float16 *V = (_Float16*)malloc(total_qkv * 2); - for (int i = 0; i < total_qkv; i++) { - Q[i] = (_Float16)(0.5f * (2*drand48()-1)); - K[i] = (_Float16)(0.5f * (2*drand48()-1)); - V[i] = (_Float16)(0.5f * (2*drand48()-1)); - } - - // IOSurfaces for Q, K, V - size_t qkv_bytes = total_qkv * 2; - IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes); - IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL); - IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL); - IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL); - - // Scores IOSurface: [1, HEADS, SEQ, SEQ] - int total_scores = HEADS * SEQ * SEQ; - size_t scores_bytes = total_scores * 2; - IOSurfaceRef ioScores = make_surface(scores_bytes); - IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes); - IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes); - - // === Run non-causal SDPA === - { - IOSurfaceRef ins[] = {ioQ, ioK, ioV}; - if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; } - } - - // === Run decomposed causal === - // Step 1: Q@K^T on ANE - { - IOSurfaceRef ins[] = {ioQ, ioK}; - if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; } - } - - // Step 2: Scale + causal mask + softmax on CPU - { - IOSurfaceLock(ioScores, 0, NULL); - _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores); - float scale = 1.0f / sqrtf((float)HD); - for (int h = 0; h < HEADS; h++) { - for (int t = 0; t < SEQ; t++) { - // Apply 
scale, causal mask, and softmax - float row[SEQ], maxs = -1e30f; - for (int t2 = 0; t2 < SEQ; t2++) { - float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale; - if (t2 > t) s = -1e30f; // causal mask - row[t2] = s; - if (s > maxs) maxs = s; - } - float sum = 0; - for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; } - for (int t2 = 0; t2 < SEQ; t2++) - scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum); - } - } - IOSurfaceUnlock(ioScores, 0, NULL); - } - - // Step 3: softmax_scores @ V on ANE - { - IOSurfaceRef ins[] = {ioScores, ioV}; - if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; } - } - - // === Verify decomposed causal === - { - float scale = 1.0f / sqrtf((float)HD); - IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); - _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp); - float maxdiff = 0; - for (int h = 0; h < HEADS; h++) - for (int t = 0; t < SEQ; t++) { - float scores[SEQ], maxs = -1e30f; - for (int t2 = 0; t2 <= t; t2++) { - float s = 0; - for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; - s *= scale; scores[t2] = s; if(s>maxs) maxs=s; - } - float sum = 0; - for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } - for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; - for (int d = 0; d < HD; d++) { - float ref = 0; - for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; - float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref); - if(diff>maxdiff) maxdiff=diff; - } - } - IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); - printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff); - } - - // === Benchmark: SDPA vs decomposed === - printf("\n=== Benchmarks ===\n"); - int N = 500; - { - IOSurfaceRef ins[] = {ioQ, ioK, ioV}; - // Warmup - for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); - uint64_t t0 = mach_absolute_time(); - for 
(int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); - double ms = tb_ms(mach_absolute_time() - t0); - double flops = 4.0 * HEADS * SEQ * SEQ * HD; - printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); - } - { - // Decomposed: QKT + CPU softmax + SV - // Warmup - for (int i = 0; i < 10; i++) { - IOSurfaceRef ins1[] = {ioQ, ioK}; - ane_eval(&kQKT, ins1, 2, ioScores); - // Skip CPU softmax in benchmark for ANE-only timing - IOSurfaceRef ins2[] = {ioScores, ioV}; - ane_eval(&kSV, ins2, 2, ioOut_decomp); - } - uint64_t t0 = mach_absolute_time(); - for (int i = 0; i < N; i++) { - IOSurfaceRef ins1[] = {ioQ, ioK}; - ane_eval(&kQKT, ins1, 2, ioScores); - // CPU softmax + causal mask - IOSurfaceLock(ioScores, 0, NULL); - _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores); - float scale = 1.0f / sqrtf((float)HD); - for (int h = 0; h < HEADS; h++) - for (int t = 0; t < SEQ; t++) { - float row[SEQ], maxs = -1e30f; - for (int t2 = 0; t2 < SEQ; t2++) { - float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale; - if (t2 > t) s = -1e30f; - row[t2] = s; if(s>maxs) maxs=s; - } - float sum = 0; - for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; } - for (int t2 = 0; t2 < SEQ; t2++) - sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum); - } - IOSurfaceUnlock(ioScores, 0, NULL); - IOSurfaceRef ins2[] = {ioScores, ioV}; - ane_eval(&kSV, ins2, 2, ioOut_decomp); - } - double ms = tb_ms(mach_absolute_time() - t0); - double flops = 4.0 * HEADS * SEQ * SEQ * HD; - printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); - } - - CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); - CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp); - free(Q); free(K); free(V); - - done: - cleanup_kern(&kSDPA); - cleanup_kern(&kQKT); - cleanup_kern(&kSV); - printf("\nDONE\n"); - } - return 0; -} +// Decomposed causal attention: Q@K^T on ANE, mask+softmax on CPU, scores@V on ANE +// This gives us causal masking 
with ANE acceleration for the matmuls +#import +#import +#import +#import +#import +#include +#include "ane_compat.h" + +#define HEADS 12 +#define HD 64 +#define SEQ 64 + +static Class g_D, g_I, g_AR, g_AIO; +static mach_timebase_info_data_t g_tb; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +typedef struct { id model; NSString *td; } Kern; + +static Kern compile_mil(NSString *mil) { + Kern k = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, @{}, nil); + if (!desc) { printf("desc=NULL\n"); return k; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:td withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf("compile FAIL: %s\n", e?[[e localizedDescription] 
UTF8String]:""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return k; + } + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + k.model = mdl; k.td = td; + return k; +} + +static BOOL ane_eval(Kern *k, IOSurfaceRef *ins, int nin, IOSurfaceRef out) { + NSMutableArray *inArr = [NSMutableArray array], *inIdx = [NSMutableArray array]; + for (int i = 0; i < nin; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ins[i])]; + [inIdx addObject:@(i)]; + } + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), out); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + return ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); +} + +static void cleanup_kern(Kern *k) { + if (!k->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:k->td error:nil]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + ane_detect_platform(); + ane_print_platform(); + mach_timebase_info(&g_tb); + + // === Approach 1: Non-causal SDPA (baseline) === + printf("=== Non-causal SDPA (baseline) ===\n"); + NSString *sdpa_mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k, tensor v) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = 
k, value = v)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Kern kSDPA = compile_mil(sdpa_mil); + printf("SDPA compile: %s\n", kSDPA.model ? "OK" : "FAIL"); + + // === Approach 2: Decomposed causal via matmul ops === + // Step 1: Q @ K^T → scores [1, HEADS, SEQ, SEQ] + // MIL matmul: matmul(x=Q, y=K, transpose_y=true) + // Q shape: [1, HEADS, SEQ, HD], K shape: [1, HEADS, SEQ, HD] + // scores = Q @ K^T → [1, HEADS, SEQ, SEQ] + printf("\n=== Decomposed causal attention ===\n"); + NSString *qkt_mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k) {\n" + " tensor scores = matmul(" + "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n" + " } -> (scores);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ]; + Kern kQKT = compile_mil(qkt_mil); + printf("Q@K^T compile: %s\n", kQKT.model ? "OK" : "FAIL"); + + // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD] + NSString *sv_mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor s, " + "tensor v) {\n" + " tensor out = matmul(" + "x = s, y = v)[name = string(\"sv\")];\n" + " } -> (out);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Kern kSV = compile_mil(sv_mil); + printf("scores@V compile: %s\n", kSV.model ? 
"OK" : "FAIL"); + + if (!kSDPA.model || !kQKT.model || !kSV.model) { + printf("Some kernels failed to compile, aborting\n"); + goto done; + } + + // Generate test data + srand48(42); + int total_qkv = HEADS * SEQ * HD; + _Float16 *Q = (_Float16*)malloc(total_qkv * 2); + _Float16 *K = (_Float16*)malloc(total_qkv * 2); + _Float16 *V = (_Float16*)malloc(total_qkv * 2); + for (int i = 0; i < total_qkv; i++) { + Q[i] = (_Float16)(0.5f * (2*drand48()-1)); + K[i] = (_Float16)(0.5f * (2*drand48()-1)); + V[i] = (_Float16)(0.5f * (2*drand48()-1)); + } + + // IOSurfaces for Q, K, V + size_t qkv_bytes = total_qkv * 2; + IOSurfaceRef ioQ = make_surface(qkv_bytes), ioK = make_surface(qkv_bytes), ioV = make_surface(qkv_bytes); + IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, qkv_bytes); IOSurfaceUnlock(ioQ, 0, NULL); + IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, qkv_bytes); IOSurfaceUnlock(ioK, 0, NULL); + IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, qkv_bytes); IOSurfaceUnlock(ioV, 0, NULL); + + // Scores IOSurface: [1, HEADS, SEQ, SEQ] + int total_scores = HEADS * SEQ * SEQ; + size_t scores_bytes = total_scores * 2; + IOSurfaceRef ioScores = make_surface(scores_bytes); + IOSurfaceRef ioOut_sdpa = make_surface(qkv_bytes); + IOSurfaceRef ioOut_decomp = make_surface(qkv_bytes); + + // === Run non-causal SDPA === + { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + if (!ane_eval(&kSDPA, ins, 3, ioOut_sdpa)) { printf("SDPA eval FAIL\n"); goto done; } + } + + // === Run decomposed causal === + // Step 1: Q@K^T on ANE + { + IOSurfaceRef ins[] = {ioQ, ioK}; + if (!ane_eval(&kQKT, ins, 2, ioScores)) { printf("Q@K^T eval FAIL\n"); goto done; } + } + + // Step 2: Scale + causal mask + softmax on CPU + { + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *scores = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + float scale = 1.0f / sqrtf((float)HD); + for (int h = 0; h < HEADS; h++) { + for (int t = 0; t < SEQ; t++) { + // Apply 
scale, causal mask, and softmax + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = (float)scores[h*SEQ*SEQ + t*SEQ + t2] * scale; + if (t2 > t) s = -1e30f; // causal mask + row[t2] = s; + if (s > maxs) maxs = s; + } + float sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { row[t2] = expf(row[t2] - maxs); sum += row[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) + scores[h*SEQ*SEQ + t*SEQ + t2] = (_Float16)(row[t2] / sum); + } + } + IOSurfaceUnlock(ioScores, 0, NULL); + } + + // Step 3: softmax_scores @ V on ANE + { + IOSurfaceRef ins[] = {ioScores, ioV}; + if (!ane_eval(&kSV, ins, 2, ioOut_decomp)) { printf("scores@V eval FAIL\n"); goto done; } + } + + // === Verify decomposed causal === + { + float scale = 1.0f / sqrtf((float)HD); + IOSurfaceLock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioOut_decomp); + float maxdiff = 0; + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float scores[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 <= t; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float ref = 0; + for (int t2 = 0; t2 <= t; t2++) ref += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + float diff = fabsf((float)out[h*SEQ*HD+t*HD+d] - ref); + if(diff>maxdiff) maxdiff=diff; + } + } + IOSurfaceUnlock(ioOut_decomp, kIOSurfaceLockReadOnly, NULL); + printf("\nDecomposed causal max diff vs CPU ref: %.6f\n", maxdiff); + } + + // === Benchmark: SDPA vs decomposed === + printf("\n=== Benchmarks ===\n"); + int N = 500; + { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + // Warmup + for (int i = 0; i < 10; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); + uint64_t t0 = mach_absolute_time(); + for 
(int i = 0; i < N; i++) ane_eval(&kSDPA, ins, 3, ioOut_sdpa); + double ms = tb_ms(mach_absolute_time() - t0); + double flops = 4.0 * HEADS * SEQ * SEQ * HD; + printf("SDPA (non-causal): %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); + } + { + // Decomposed: QKT + CPU softmax + SV + // Warmup + for (int i = 0; i < 10; i++) { + IOSurfaceRef ins1[] = {ioQ, ioK}; + ane_eval(&kQKT, ins1, 2, ioScores); + // Skip CPU softmax in benchmark for ANE-only timing + IOSurfaceRef ins2[] = {ioScores, ioV}; + ane_eval(&kSV, ins2, 2, ioOut_decomp); + } + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + IOSurfaceRef ins1[] = {ioQ, ioK}; + ane_eval(&kQKT, ins1, 2, ioScores); + // CPU softmax + causal mask + IOSurfaceLock(ioScores, 0, NULL); + _Float16 *sc = (_Float16*)IOSurfaceGetBaseAddress(ioScores); + float scale = 1.0f / sqrtf((float)HD); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + float row[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = (float)sc[h*SEQ*SEQ+t*SEQ+t2] * scale; + if (t2 > t) s = -1e30f; + row[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { row[t2]=expf(row[t2]-maxs); sum+=row[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) + sc[h*SEQ*SEQ+t*SEQ+t2] = (_Float16)(row[t2]/sum); + } + IOSurfaceUnlock(ioScores, 0, NULL); + IOSurfaceRef ins2[] = {ioScores, ioV}; + ane_eval(&kSV, ins2, 2, ioOut_decomp); + } + double ms = tb_ms(mach_absolute_time() - t0); + double flops = 4.0 * HEADS * SEQ * SEQ * HD; + printf("Decomposed causal: %.3f ms/eval, %.1f GFLOPS\n", ms/N, N*flops/ms/1e6); + } + + CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); + CFRelease(ioScores); CFRelease(ioOut_sdpa); CFRelease(ioOut_decomp); + free(Q); free(K); free(V); + + done: + cleanup_kern(&kSDPA); + cleanup_kern(&kQKT); + cleanup_kern(&kSV); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m index 0ddce84..70a6d40 100644 --- 
a/training/test_ane_sdpa5.m +++ b/training/test_ane_sdpa5.m @@ -1,297 +1,304 @@ -// Debug: why causal mask doesn't apply. Try different approaches. -#import -#import -#import -#import -#include - -#define HEADS 12 -#define HD 64 -#define SEQ 8 // small for readable output - -static Class g_D, g_I, g_AR, g_AIO; -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -// Build inline mask string for MIL: tensor([v00, v01, ...]) -static NSString *build_inline_causal_mask(int s) { - NSMutableString *vals = [NSMutableString string]; - for (int t = 0; t < s; t++) { - for (int t2 = 0; t2 < s; t2++) { - if (t > 0 || t2 > 0) [vals appendString:@", "]; - [vals appendString:(t2 <= t) ? @"0" : @"-65504"]; // fp16 -inf - } - } - return [NSString stringWithFormat: - @"tensor([%@])", s, s, vals]; -} - -static NSData *build_mask_blob(int seq) { - int wsize = seq * seq * 2; - int total = 128 + wsize; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; - *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; - _Float16 *fp16 = (_Float16*)(buf+128); - for (int t = 0; t < seq; t++) - for (int t2 = 0; t2 < seq; t2++) - fp16[t*seq + t2] = (t2 <= t) ? 
(_Float16)0.0f : (_Float16)(-65504.0f); - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -typedef struct { id model; NSString *td; } Model; - -static Model compile_model(NSString *mil, NSDictionary *wd) { - Model m = {nil, nil}; - NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); - if (!desc) { printf(" desc=NULL\n"); return m; } - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; - [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - for (NSString *path in wd) { - [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; - } - NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { - printf(" compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e localizedDescription] length])] UTF8String]:""); - [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; - } - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { - printf(" load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; - } - m.model = mdl; m.td = td; - return m; -} - -static void cleanup_model(Model *m) { - if (!m->model) return; - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(m->model, 
@selector(unloadWithQoS:error:), 21, &e); - [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil]; -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - ane_init(); - - srand48(42); - int total = HEADS * SEQ * HD; - _Float16 *Q = (_Float16*)malloc(total * 2); - _Float16 *K = (_Float16*)malloc(total * 2); - _Float16 *V = (_Float16*)malloc(total * 2); - for (int i = 0; i < total; i++) { - Q[i] = (_Float16)(0.5f * (2*drand48()-1)); - K[i] = (_Float16)(0.5f * (2*drand48()-1)); - V[i] = (_Float16)(0.5f * (2*drand48()-1)); - } - - size_t bytes = total * 2; - IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes); - IOSurfaceRef ioV = make_surface(bytes); - IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL); - IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL); - IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL); - id wQ = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioQ); - id wK = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioK); - id wV = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioV); - - // CPU references - float scale = 1.0f / sqrtf((float)HD); - float *cpu_causal = (float*)calloc(total, sizeof(float)); - float *cpu_nocausal = (float*)calloc(total, sizeof(float)); - for (int h = 0; h < HEADS; h++) - for (int t = 0; t < SEQ; t++) { - // Causal - float scores[SEQ], maxs = -1e30f; - for (int t2 = 0; t2 <= t; t2++) { - float s = 0; - for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; - s *= scale; scores[t2] = s; if(s>maxs) maxs=s; - } - float sum = 0; - for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } - for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; - for (int d = 0; d 
< HD; d++) { - float r = 0; - for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; - cpu_causal[h*SEQ*HD+t*HD+d] = r; - } - // Non-causal - maxs = -1e30f; - for (int t2 = 0; t2 < SEQ; t2++) { - float s = 0; - for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; - s *= scale; scores[t2] = s; if(s>maxs) maxs=s; - } - sum = 0; - for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } - for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum; - for (int d = 0; d < HD; d++) { - float r = 0; - for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; - cpu_nocausal[h*SEQ*HD+t*HD+d] = r; - } - } - - // Helper: eval and compare - void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) = - ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) { - IOSurfaceRef ioO = make_surface(bytes); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); - NSMutableArray *inArr = [NSMutableArray array]; - NSMutableArray *inIdx = [NSMutableArray array]; - for (int i = 0; i < nInputs; i++) { - [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])]; - [inIdx addObject:@(i)]; - } - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - inArr, inIdx, @[wO], @[@0], nil, nil, @0); - NSError *e = nil; - BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - if (!ok) { - printf(" %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:""); - CFRelease(ioO); return; - } - IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); - _Float16 *out = 
(_Float16*)IOSurfaceGetBaseAddress(ioO); - float dc=0, dnc=0; - for (int i = 0; i < total; i++) { - float v = (float)out[i]; - float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1; - float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2; - } - IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); - printf(" %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc, - dc < dnc ? "CAUSAL" : (dc > dnc ? "NON-CAUSAL" : "SAME")); - CFRelease(ioO); - }; - - // === Test 1: No mask (should be non-causal) === - printf("Test 1: no mask\n"); - { - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k, tensor v) {\n" - " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" - " } -> (att);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; - Model m = compile_model(mil, nil); - if (m.model) { - IOSurfaceRef ins[] = {ioQ, ioK, ioV}; - eval_and_compare("no-mask", &m, 3, ins); - cleanup_model(&m); - } - } - - // === Test 2: Inline causal mask === - printf("\nTest 2: inline causal mask\n"); - { - NSString *maskStr = build_inline_causal_mask(SEQ); - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k, tensor v) {\n" - " %@ mask = const()[name = string(\"mask\"), val = %@];\n" - " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" - " } -> (att);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, - [NSString stringWithFormat:@"tensor", 
SEQ, SEQ], maskStr, - HEADS, SEQ, HD]; - Model m = compile_model(mil, nil); - if (m.model) { - IOSurfaceRef ins[] = {ioQ, ioK, ioV}; - eval_and_compare("inline-mask", &m, 3, ins); - cleanup_model(&m); - } - } - - // === Test 3: BLOBFILE mask === - printf("\nTest 3: BLOBFILE causal mask\n"); - { - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k, tensor v) {\n" - " tensor mask = const()[name = string(\"mask\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" - " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" - " } -> (att);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, - SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD]; - NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}}; - Model m = compile_model(mil, wd); - if (m.model) { - IOSurfaceRef ins[] = {ioQ, ioK, ioV}; - eval_and_compare("blob-mask", &m, 3, ins); - cleanup_model(&m); - } - } - - // === Test 4: mask as runtime input === - printf("\nTest 4: mask as runtime input\n"); - { - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " - "tensor k, tensor v, " - "tensor mask) {\n" - " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" - " } -> (att);\n}\n", - HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, - SEQ, SEQ, HEADS, SEQ, HD]; - Model m = compile_model(mil, nil); - if (m.model) { - 
// Create mask IOSurface - size_t mbytes = SEQ * SEQ * 2; - IOSurfaceRef ioM = make_surface(mbytes); - IOSurfaceLock(ioM, 0, NULL); - _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM); - for (int t = 0; t < SEQ; t++) - for (int t2 = 0; t2 < SEQ; t2++) - mp[t*SEQ+t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f); - IOSurfaceUnlock(ioM, 0, NULL); - - IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM}; - eval_and_compare("runtime-mask", &m, 4, ins); - CFRelease(ioM); - cleanup_model(&m); - } - } - - CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); - free(Q); free(K); free(V); - free(cpu_causal); free(cpu_nocausal); - printf("\nDONE\n"); - } - return 0; -} +// Debug: why causal mask doesn't apply. Try different approaches. +#import +#import +#import +#import +#include +#include "ane_compat.h" + +#define HEADS 12 +#define HD 64 +#define SEQ 8 // small for readable output + +static Class g_D, g_I, g_AR, g_AIO; +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +// Build inline mask string for MIL: tensor([v00, v01, ...]) +static NSString *build_inline_causal_mask(int s) { + NSMutableString *vals = [NSMutableString string]; + for (int t = 0; t < s; t++) { + for (int t2 = 0; t2 < s; t2++) { + if (t > 0 || t2 > 0) [vals appendString:@", "]; + [vals appendString:(t2 <= t) ? 
@"0" : @"-65504"]; // fp16 -inf + } + } + return [NSString stringWithFormat: + @"tensor([%@])", s, s, vals]; +} + +static NSData *build_mask_blob(int seq) { + int wsize = seq * seq * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; + *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; + _Float16 *fp16 = (_Float16*)(buf+128); + for (int t = 0; t < seq; t++) + for (int t2 = 0; t2 < seq; t2++) + fp16[t*seq + t2] = (t2 <= t) ? (_Float16)0.0f : (_Float16)(-65504.0f); + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +typedef struct { id model; NSString *td; } Model; + +static Model compile_model(NSString *mil, NSDictionary *wd) { + Model m = {nil, nil}; + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd ?: @{}, nil); + if (!desc) { printf(" desc=NULL\n"); return m; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + for (NSString *path in wd) { + [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; + } + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + printf(" compile FAIL: %s\n", e?[[[e localizedDescription] substringToIndex:MIN(300,(int)[[e 
localizedDescription] length])] UTF8String]:""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; + } + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) { + printf(" load FAIL\n"); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return m; + } + m.model = mdl; m.td = td; + return m; +} + +static void cleanup_model(Model *m) { + if (!m->model) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(m->model, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:m->td error:nil]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + ane_init(); + ane_detect_platform(); + ane_print_platform(); + + srand48(42); + int total = HEADS * SEQ * HD; + _Float16 *Q = (_Float16*)malloc(total * 2); + _Float16 *K = (_Float16*)malloc(total * 2); + _Float16 *V = (_Float16*)malloc(total * 2); + for (int i = 0; i < total; i++) { + Q[i] = (_Float16)(0.5f * (2*drand48()-1)); + K[i] = (_Float16)(0.5f * (2*drand48()-1)); + V[i] = (_Float16)(0.5f * (2*drand48()-1)); + } + + size_t bytes = total * 2; + IOSurfaceRef ioQ = make_surface(bytes), ioK = make_surface(bytes); + IOSurfaceRef ioV = make_surface(bytes); + IOSurfaceLock(ioQ, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioQ), Q, bytes); IOSurfaceUnlock(ioQ, 0, NULL); + IOSurfaceLock(ioK, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioK), K, bytes); IOSurfaceUnlock(ioK, 0, NULL); + IOSurfaceLock(ioV, 0, NULL); memcpy(IOSurfaceGetBaseAddress(ioV), V, bytes); IOSurfaceUnlock(ioV, 0, NULL); + id wQ = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioQ); + id wK = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioK); + id wV = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioV); + + // CPU references + float scale = 1.0f / sqrtf((float)HD); + 
float *cpu_causal = (float*)calloc(total, sizeof(float)); + float *cpu_nocausal = (float*)calloc(total, sizeof(float)); + for (int h = 0; h < HEADS; h++) + for (int t = 0; t < SEQ; t++) { + // Causal + float scores[SEQ], maxs = -1e30f; + for (int t2 = 0; t2 <= t; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + float sum = 0; + for (int t2 = 0; t2 <= t; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 <= t; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float r = 0; + for (int t2 = 0; t2 <= t; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + cpu_causal[h*SEQ*HD+t*HD+d] = r; + } + // Non-causal + maxs = -1e30f; + for (int t2 = 0; t2 < SEQ; t2++) { + float s = 0; + for (int d = 0; d < HD; d++) s += (float)Q[h*SEQ*HD+t*HD+d]*(float)K[h*SEQ*HD+t2*HD+d]; + s *= scale; scores[t2] = s; if(s>maxs) maxs=s; + } + sum = 0; + for (int t2 = 0; t2 < SEQ; t2++) { scores[t2]=expf(scores[t2]-maxs); sum+=scores[t2]; } + for (int t2 = 0; t2 < SEQ; t2++) scores[t2]/=sum; + for (int d = 0; d < HD; d++) { + float r = 0; + for (int t2 = 0; t2 < SEQ; t2++) r += scores[t2]*(float)V[h*SEQ*HD+t2*HD+d]; + cpu_nocausal[h*SEQ*HD+t*HD+d] = r; + } + } + + // Helper: eval and compare + void (^eval_and_compare)(const char*, Model*, int nInputs, IOSurfaceRef*) = + ^(const char *label, Model *m, int nInputs, IOSurfaceRef *inputs) { + IOSurfaceRef ioO = make_surface(bytes); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); + NSMutableArray *inArr = [NSMutableArray array]; + NSMutableArray *inIdx = [NSMutableArray array]; + for (int i = 0; i < nInputs; i++) { + [inArr addObject:((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), inputs[i])]; + [inIdx addObject:@(i)]; + } + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + 
@selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + inArr, inIdx, @[wO], @[@0], nil, nil, @0); + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + m->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + if (!ok) { + printf(" %s: eval FAIL: %s\n", label, e?[[[e localizedDescription] substringToIndex:MIN(200,(int)[[e localizedDescription] length])] UTF8String]:""); + CFRelease(ioO); return; + } + IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(ioO); + float dc=0, dnc=0; + for (int i = 0; i < total; i++) { + float v = (float)out[i]; + float d1 = fabsf(v - cpu_causal[i]); if(d1>dc) dc=d1; + float d2 = fabsf(v - cpu_nocausal[i]); if(d2>dnc) dnc=d2; + } + IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); + printf(" %s: diff_causal=%.6f diff_nocausal=%.6f → %s\n", label, dc, dnc, + dc < dnc ? "CAUSAL" : (dc > dnc ? 
"NON-CAUSAL" : "SAME")); + CFRelease(ioO); + }; + + // === Test 1: No mask (should be non-causal) === + printf("Test 1: no mask\n"); + { + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k, tensor v) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("no-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 2: Inline causal mask === + printf("\nTest 2: inline causal mask\n"); + { + NSString *maskStr = build_inline_causal_mask(SEQ); + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k, tensor v) {\n" + " %@ mask = const()[name = string(\"mask\"), val = %@];\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + [NSString stringWithFormat:@"tensor", SEQ, SEQ], maskStr, + HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("inline-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 3: BLOBFILE mask === + printf("\nTest 3: BLOBFILE causal mask\n"); + { + NSString *mil = [NSString stringWithFormat: + 
@"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k, tensor v) {\n" + " tensor mask = const()[name = string(\"mask\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD]; + NSDictionary *wd = @{@"@model_path/weights/mask.bin": @{@"offset":@0, @"data":build_mask_blob(SEQ)}}; + Model m = compile_model(mil, wd); + if (m.model) { + IOSurfaceRef ins[] = {ioQ, ioK, ioV}; + eval_and_compare("blob-mask", &m, 3, ins); + cleanup_model(&m); + } + } + + // === Test 4: mask as runtime input === + printf("\nTest 4: mask as runtime input\n"); + { + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor q, " + "tensor k, tensor v, " + "tensor mask) {\n" + " tensor att = scaled_dot_product_attention(" + "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + " } -> (att);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, + SEQ, SEQ, HEADS, SEQ, HD]; + Model m = compile_model(mil, nil); + if (m.model) { + // Create mask IOSurface + size_t mbytes = SEQ * SEQ * 2; + IOSurfaceRef ioM = make_surface(mbytes); + IOSurfaceLock(ioM, 0, NULL); + _Float16 *mp = (_Float16*)IOSurfaceGetBaseAddress(ioM); + for (int t = 0; t < SEQ; t++) + for (int t2 = 0; t2 < SEQ; t2++) + mp[t*SEQ+t2] = (t2 <= 
t) ? (_Float16)0.0f : (_Float16)(-65504.0f); + IOSurfaceUnlock(ioM, 0, NULL); + + IOSurfaceRef ins[] = {ioQ, ioK, ioV, ioM}; + eval_and_compare("runtime-mask", &m, 4, ins); + CFRelease(ioM); + cleanup_model(&m); + } + } + + CFRelease(ioQ); CFRelease(ioK); CFRelease(ioV); + free(Q); free(K); free(V); + free(cpu_causal); free(cpu_nocausal); + printf("\nDONE\n"); + } + return 0; +} diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m index a396b4d..a52a11b 100644 --- a/training/test_conv_attn3.m +++ b/training/test_conv_attn3.m @@ -5,6 +5,7 @@ #import #import #include +#include "ane_compat.h" #define HEADS 12 #define HD 64 @@ -82,10 +83,10 @@ static void cleanup_kern(Kern *k) { static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) { return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" " tensor W = const()[name = string(\"W\"), " "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" @@ -95,13 +96,15 @@ static void cleanup_kern(Kern *k) { " int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n" " tensor y = conv(dilations = dl, groups = gr, pad = pd, " "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n" - " } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp]; + " } -> (y);\n}\n", g_ane_platform.mil_program, ane_mil_target(), ic, sp, oc, icg, oc, icg, groups, oc, sp]; } int main() { @autoreleasepool { setbuf(stdout, NULL); 
ane_init(); + ane_detect_platform(); + ane_print_platform(); mach_timebase_info(&g_tb); printf("=== Grouped Conv Causal Attention (layout A) ===\n"); diff --git a/training/test_full_fused.m b/training/test_full_fused.m index 8449ddb..f9d36d4 100644 --- a/training/test_full_fused.m +++ b/training/test_full_fused.m @@ -7,6 +7,7 @@ #import #import #include +#include "ane_compat.h" #define DIM 768 #define HEADS 12 @@ -104,6 +105,8 @@ int main() { @autoreleasepool { setbuf(stdout, NULL); ane_init(); + ane_detect_platform(); + ane_print_platform(); mach_timebase_info(&g_tb); srand48(42); @@ -130,10 +133,10 @@ int main() { float scale_val = 1.0f / sqrtf((float)HD); NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" // Conv boilerplate " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" @@ -189,6 +192,7 @@ int main() { " tensor out = conv(dilations = dl, groups = gr1, pad = pd, " "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n" " } -> (out);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), DIM, SEQ, // input DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wv, Wo @@ -317,10 +321,10 @@ int main() { printf("\n=== Test 2: Fused FFN benchmark ===\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, 
{\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" @@ -342,6 +346,7 @@ int main() { " tensor out = conv(dilations = dl, groups = gr, pad = pd, " "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n" " } -> (out);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), DIM, SEQ, HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, HIDDEN,SEQ, DIM,SEQ]; diff --git a/training/test_fused_bwd.m b/training/test_fused_bwd.m index b91d7b6..9dd6f9b 100644 --- a/training/test_fused_bwd.m +++ b/training/test_fused_bwd.m @@ -1,184 +1,188 @@ -// Test: fused backward dx kernels -// 1. Fused QKV backward: concat(Wq^T@dq, Wk^T@dk, Wv^T@dv) — 3 inputs, 1 output -// Problem: 3 separate gradient inputs. Can we concat them as input? -// Input: [1, DIM*3, 1, SEQ] = concat(dq, dk, dv) -// Use 3 separate convs on slices? MIL has slice_by_size. -// 2. 
Fused W1b+W3b: input concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ] -// Two convs on slices, add results → [1, DIM, 1, SEQ] -#import -#import -#import -#import -#include - -#define DIM 768 -#define HIDDEN 2048 -#define SEQ 64 - -static Class g_D, g_I, g_AR, g_AIO; -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} -static NSData *build_blob_t(const float *w, int rows, int cols) { - int wsize = cols * rows * 2, total = 128 + wsize; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; - *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; - _Float16 *fp16 = (_Float16*)(buf+128); - for (int i = 0; i < rows; i++) - for (int j = 0; j < cols; j++) - fp16[j*rows+i] = (_Float16)w[i*cols+j]; - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - ane_init(); - - srand48(42); - float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float)); - float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float)); - float sc = 1.0f/sqrtf(HIDDEN); - for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); } - - // Test: fused W1b+W3b backward - // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ] - // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ] - // MIL: slice input → 2 convs → add - printf("=== Fused W1b+W3b backward (slice+conv+add) 
===\n"); - - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" // [1, HIDDEN*2, 1, SEQ] - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :] - " tensor b1 = const()[name = string(\"b1\"), val = tensor([0, 0, 0, 0])];\n" - " tensor s1 = const()[name = string(\"s1\"), val = tensor([1, %d, 1, %d])];\n" - " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n" - " tensor b3 = const()[name = string(\"b3\"), val = tensor([0, %d, 0, 0])];\n" - " tensor s3 = const()[name = string(\"s3\"), val = tensor([1, %d, 1, %d])];\n" - " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n" - // Conv: W1^T @ dh1, W3^T @ dh3 - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - // W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM]) - " tensor W1t = const()[name = string(\"W1t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n" - " tensor W3t = const()[name = string(\"W3t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n" - " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n" - " tensor dx3 = conv(dilations = 
dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n" - // Add - " tensor sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n" - " } -> (y);\n}\n", - HIDDEN*2, SEQ, HIDDEN*2, SEQ, - HIDDEN, SEQ, HIDDEN, SEQ, // slice1 - HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3 - DIM, HIDDEN, DIM, HIDDEN, // W1t - DIM, HIDDEN, DIM, HIDDEN, // W3t - DIM, SEQ, DIM, SEQ, // dx1, dx3 - DIM, SEQ, DIM, SEQ]; // sum, y - - NSDictionary *wd = @{ - @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)}, - @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)} - }; - - NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil); - if (!desc) { printf("desc=NULL\n"); return 1; } - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; - [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - for (NSString *path in wd) { - [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES]; - } - - NSError *e = nil; - BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); - printf("Compile: %s\n", ok?"OK":"FAIL"); - if (!ok) { printf(" %s\n", e?[[e description] 
UTF8String]:""); return 1; } - ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - printf("Load: %s\n", ok?"OK":"FAIL"); - if (!ok) return 1; - - // Prepare input: concat(dh1, dh3) in channel-first layout - float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); - float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); - for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); } - - IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4); - IOSurfaceLock(ioI, 0, NULL); - float *dst = (float*)IOSurfaceGetBaseAddress(ioI); - // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3 - for (int t = 0; t < SEQ; t++) { - for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; - for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; - } - IOSurfaceUnlock(ioI, 0, NULL); - - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - - ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - printf("Eval: %s\n", ok?"OK":"FAIL"); - if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; } - - // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3 - float *ref = (float*)calloc(SEQ*DIM, sizeof(float)); - for (int t = 0; t < SEQ; t++) - for (int i = 0; i < DIM; i++) { - float s = 0; - for (int j = 0; j < HIDDEN; j++) { - s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i] - s += W3[j*DIM+i] * dh3[t*HIDDEN+j]; - } - ref[t*DIM+i] = s; - } 
-
-        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
-        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
-        float maxd = 0;
-        for (int t = 0; t < SEQ; t++)
-            for (int c = 0; c < DIM; c++) {
-                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
-                if (d > maxd) maxd = d;
-            }
-        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
-        printf("dx max diff: %.6f\n", maxd);
-
-        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
-        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
-        CFRelease(ioI); CFRelease(ioO);
-        free(W1); free(W3); free(dh1); free(dh3); free(ref);
-        printf("\nDONE\n");
-    }
-    return 0;
-}
+// Test: fused backward dx kernels
+// 1. Fused QKV backward: concat(Wq^T@dq, Wk^T@dk, Wv^T@dv) — 3 inputs, 1 output
+//    Problem: 3 separate gradient inputs. Can we concat them as input?
+//    Input: [1, DIM*3, 1, SEQ] = concat(dq, dk, dv)
+//    Use 3 separate convs on slices? MIL has slice_by_size.
+// 2. Fused W1b+W3b: input concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+//    Two convs on slices, add results → [1, DIM, 1, SEQ]
+#import
+#import
+#import
+#import
+#include
+#include "ane_compat.h"
+
+#define DIM 768
+#define HIDDEN 2048
+#define SEQ 64
+
+static Class g_D, g_I, g_AR, g_AIO;
+static void ane_init(void) { // Loads the private AppleNeuralEngine framework and caches the four _ANE* classes that main() messages.
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+static IOSurfaceRef make_surface(size_t bytes) { // Creates a one-row IOSurface whose width, bytes-per-row and alloc size all equal `bytes`; caller must CFRelease it.
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+static NSData *build_blob_t(const float *w, int rows, int cols) { // Packs w (rows x cols, row-major) transposed to cols x rows as fp16 behind a 128-byte header.
+    int wsize = cols * rows * 2, total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    buf[0]=1; buf[4]=2; buf[64]=0xEF; buf[65]=0xBE; buf[66]=0xAD; buf[67]=0xDE; buf[68]=1; // fixed header bytes; NOTE(review): field meanings are not visible in this file
+    *(uint32_t*)(buf+72)=wsize; *(uint32_t*)(buf+80)=128; // payload byte count at +72, payload start offset (128) at +80
+    _Float16 *fp16 = (_Float16*)(buf+128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j*rows+i] = (_Float16)w[i*cols+j];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; // NSData takes ownership of buf
+}
+// main: builds the fused W1b+W3b backward MIL, compiles/loads/evaluates it once, prints max |ANE - CPU ref|; returns 1 on any failure.
+int main() {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+
+        srand48(42);
+        float *W1 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float *W3 = (float*)malloc(HIDDEN*DIM*sizeof(float));
+        float sc = 1.0f/sqrtf(HIDDEN);
+        for (int i = 0; i < HIDDEN*DIM; i++) { W1[i]=sc*(2*drand48()-1); W3[i]=sc*(2*drand48()-1); }
+
+        // Test: fused W1b+W3b backward
+        // Input: concat(dh1, dh3) [1, HIDDEN*2, 1, SEQ]
+        // Output: W1^T@dh1 + W3^T@dh3 [1, DIM, 1, SEQ]
+        // MIL: slice input → 2 convs → add
+        printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");
+
+        NSString *mil = [NSString stringWithFormat:
+            @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, "
+            "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+            "{\"coremltools-version\", \"\"}})]\n{\n"
+            " func main<%s>(tensor x) {\n" // [1, HIDDEN*2, 1, SEQ]
+            " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+            " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+            // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
+            " tensor b1 = const()[name = string(\"b1\"), val = tensor([0, 0, 0, 0])];\n"
+            " tensor s1 = const()[name = string(\"s1\"), val = tensor([1, %d, 1, %d])];\n"
+            " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
+            " tensor b3 = const()[name = string(\"b3\"), val = tensor([0, %d, 0, 0])];\n"
+            " tensor s3 = const()[name = string(\"s3\"), val = tensor([1, %d, 1, %d])];\n"
+            " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
+            // Conv: W1^T @ dh1, W3^T @ dh3
+            " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+            " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n"
+            " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n"
+            " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n"
+            " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+            // W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM])
+            " tensor W1t = const()[name = string(\"W1t\"), "
+            "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
+            " tensor W3t = const()[name = string(\"W3t\"), "
+            "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
+            " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
+            " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
+            // Add
+            " tensor sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
+            " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+            " tensor y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
+            " } -> (y);\n}\n",
+            g_ane_platform.mil_program, ane_mil_target(),
+            HIDDEN*2, SEQ, HIDDEN*2, SEQ,
+            HIDDEN, SEQ, HIDDEN, SEQ, // slice1
+            HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3
+            DIM, HIDDEN, DIM, HIDDEN, // W1t
+            DIM, HIDDEN, DIM, HIDDEN, // W3t
+            DIM, SEQ, DIM, SEQ, // dx1, dx3
+            DIM, SEQ, DIM, SEQ]; // sum, y
+
+        NSDictionary *wd = @{
+            @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
+            @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(W3, HIDDEN, DIM)}
+        };
+
+        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), md, wd, nil);
+        if (!desc) { printf("desc=NULL\n"); return 1; }
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        if (!mdl) { printf("mdl=NULL\n"); return 1; } // a nil model would yield a nil identifier and make the path append below raise
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier))];
+        [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        for (NSString *path in wd) {
+            [wd[path][@"data"] writeToFile:[td stringByAppendingPathComponent:[path stringByReplacingOccurrencesOfString:@"@model_path/" withString:@""]] atomically:YES];
+        }
+        // td now exists on disk: every failure exit below reports the NSError and removes td, as the success path does.
+        NSError *e = nil;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        printf("Compile: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return 1; }
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
+        printf("Load: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return 1; }
+
+        // Prepare input: concat(dh1, dh3) in channel-first layout
+        float *dh1 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
+        for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }
+
+        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
+        IOSurfaceLock(ioI, 0, NULL);
+        float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
+        // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
+        for (int t = 0; t < SEQ; t++) {
+            for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
+            for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
+        }
+        IOSurfaceUnlock(ioI, 0, NULL);
+
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
+        id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+            @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+            @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+
+        ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        printf("Eval: %s\n", ok?"OK":"FAIL");
+        if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; return 1; }
+
+        // CPU reference: dx = W1^T @ dh1 + W3^T @ dh3
+        float *ref = (float*)calloc(SEQ*DIM, sizeof(float));
+        for (int t = 0; t < SEQ; t++)
+            for (int i = 0; i < DIM; i++) {
+                float s = 0;
+                for (int j = 0; j < HIDDEN; j++) {
+                    s += W1[j*DIM+i] * dh1[t*HIDDEN+j]; // W1^T[i,j] = W1[j,i]
+                    s += W3[j*DIM+i] * dh3[t*HIDDEN+j];
+                }
+                ref[t*DIM+i] = s;
+            }
+
+        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
+        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
+        float maxd = 0;
+        for (int t = 0; t < SEQ; t++)
+            for (int c = 0; c < DIM; c++) {
+                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
+                if (d > maxd) maxd = d;
+            }
+        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
+        printf("dx max diff: %.6f\n", maxd);
+
+        ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+        CFRelease(ioI); CFRelease(ioO);
+        free(W1); free(W3); free(dh1); free(dh3); free(ref);
+        printf("\nDONE\n");
+    }
+    return 0;
+}
diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m
index 69f41d6..14428f6 100644
--- a/training/test_fused_qkv.m
+++ b/training/test_fused_qkv.m
@@ -8,6 +8,7 @@
 #import
 #import
 #include
+#include
"ane_compat.h" #define DIM 768 #define SEQ 64 @@ -86,10 +87,10 @@ static void cleanup_kern(Kern *k) { // Fused QKV: 3 convs + concat in one MIL static NSString *gen_fused_qkv_mil(void) { return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" @@ -115,6 +116,7 @@ static void cleanup_kern(Kern *k) { " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" " tensor y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n" " } -> (y);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, // Wq DIM, DIM, DIM, DIM, // Wk @@ -129,10 +131,10 @@ static void cleanup_kern(Kern *k) { // Single conv MIL for comparison static NSString *gen_single_mil(void) { return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" " tensor x16 = cast(dtype = d1, x 
= x)[name = string(\"cx\")];\n" " tensor W = const()[name = string(\"W\"), " @@ -147,6 +149,7 @@ static void cleanup_kern(Kern *k) { " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" " } -> (y);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ]; } @@ -154,6 +157,8 @@ int main() { @autoreleasepool { setbuf(stdout, NULL); ane_init(); + ane_detect_platform(); + ane_print_platform(); mach_timebase_info(&g_tb); printf("=== Fused QKV vs 3x Separate Convs ===\n"); diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m index cf7b073..d7c1665 100644 --- a/training/test_perf_stats.m +++ b/training/test_perf_stats.m @@ -1,233 +1,236 @@ -// test_perf_stats.m — What does _ANEPerformanceStats expose? -// Probe class methods, properties, instantiate, pass to request, read back. -#import -#import -#import -#import -#import -#import - -static mach_timebase_info_data_t g_tb; -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } - -static void dump_class(const char *name) { - Class cls = NSClassFromString([NSString stringWithUTF8String:name]); - if (!cls) { printf(" %s: NOT FOUND\n", name); return; } - printf("\n=== %s ===\n", name); - - unsigned int count; - Method *methods = class_copyMethodList(object_getClass(cls), &count); - if (count) printf(" Class methods:\n"); - for (unsigned int i = 0; i < count; i++) { - SEL s = method_getName(methods[i]); - const char *enc = method_getTypeEncoding(methods[i]); - printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); - } - free(methods); - - methods = class_copyMethodList(cls, &count); - if (count) printf(" Instance methods:\n"); - for (unsigned int i = 0; i < count; i++) { - SEL s = method_getName(methods[i]); - const char *enc = method_getTypeEncoding(methods[i]); - printf(" - %s [%s]\n", sel_getName(s), enc ? 
enc : "?"); - } - free(methods); - - unsigned int pcount; - objc_property_t *props = class_copyPropertyList(cls, &pcount); - if (pcount) printf(" Properties:\n"); - for (unsigned int i = 0; i < pcount; i++) { - const char *pname = property_getName(props[i]); - const char *pattr = property_getAttributes(props[i]); - printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); - } - free(props); -} - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - mach_timebase_info(&g_tb); - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - - printf("=== ANE Performance Stats Probe ===\n"); - - dump_class("_ANEPerformanceStats"); - dump_class("_ANEPerfRequest"); - dump_class("ANEPerfRequest"); - dump_class("_ANEPerformanceCounters"); - dump_class("_ANEDeviceInfo"); - dump_class("_ANEModel"); - dump_class("_ANEInMemoryModel"); - dump_class("_ANERequest"); - dump_class("_ANEIOSurfaceObject"); - dump_class("_ANEInMemoryModelDescriptor"); - dump_class("_ANEClient"); - dump_class("_ANEVirtualClient"); - - // Try to instantiate _ANEPerformanceStats - printf("\n=== Instantiation Tests ===\n"); - Class perfClass = NSClassFromString(@"_ANEPerformanceStats"); - if (perfClass) { - @try { - id perfStats = [[perfClass alloc] init]; - printf("_ANEPerformanceStats alloc/init: %s\n", - perfStats ? 
[[perfStats description] UTF8String] : "nil"); - if (perfStats) { - unsigned int pcount; - objc_property_t *props = class_copyPropertyList(perfClass, &pcount); - for (unsigned int i = 0; i < pcount; i++) { - const char *pname = property_getName(props[i]); - @try { - id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; - printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); - } @catch (NSException *ex) { - printf(" %s = \n", pname, [[ex reason] UTF8String]); - } - } - free(props); - } - } @catch (NSException *ex) { - printf("Exception: %s\n", [[ex reason] UTF8String]); - } - } - - // Compile a working kernel and test perfStats in request - printf("\n=== Compile kernel and test perfStats in request ===\n"); - Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - Class g_I = NSClassFromString(@"_ANEInMemoryModel"); - Class g_AR = NSClassFromString(@"_ANERequest"); - Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - - int CH = 64, SP = 32; - _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); - for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; - int ws = CH*CH*2, tot = 128+ws; - uint8_t *blob = (uint8_t*)calloc(tot,1); - blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; - *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - memcpy(blob+128, w, ws); - NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - free(w); - - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " 
tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; - - NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), - md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] - withIntermediateDirectories:YES attributes:nil error:nil]; - [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - - int ioBytes = CH * SP * 4; // fp32 - IOSurfaceRef ioIn = make_surface(ioBytes); - IOSurfaceRef ioOut = make_surface(ioBytes); - id wI = 
((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - - // Try creating request WITH perfStats - if (perfClass) { - id perfStats = [[perfClass alloc] init]; - printf(" Creating request with perfStats=%s\n", perfStats ? "non-nil" : "nil"); - - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0); - printf(" Request: %s\n", req ? "created" : "nil"); - - if (req) { - IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; - IOSurfaceUnlock(ioIn, 0, NULL); - - BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - printf(" Eval: %s\n", ok ? "OK" : [[e description] UTF8String]); - - if (ok && perfStats) { - printf("\n PerfStats after 1 eval:\n"); - unsigned int pcount; - objc_property_t *props = class_copyPropertyList(perfClass, &pcount); - for (unsigned int i = 0; i < pcount; i++) { - const char *pname = property_getName(props[i]); - @try { - id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; - printf(" %s = %s\n", pname, val ? 
[[val description] UTF8String] : "nil"); - } @catch (NSException *ex) { - printf(" %s = \n", pname); - } - } - free(props); - - printf("\n Running 100 evals...\n"); - uint64_t t0 = mach_absolute_time(); - for (int i = 0; i < 100; i++) { - ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - } - printf(" 100 evals in %.1fms (%.2fms/eval)\n", - tb_ms(mach_absolute_time()-t0), tb_ms(mach_absolute_time()-t0)/100.0); - - printf("\n PerfStats after 101 evals:\n"); - props = class_copyPropertyList(perfClass, &pcount); - for (unsigned int i = 0; i < pcount; i++) { - const char *pname = property_getName(props[i]); - @try { - id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; - printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); - } @catch (NSException *ex) { - printf(" %s = \n", pname); - } - } - free(props); - } - } - } else { - printf(" _ANEPerformanceStats class NOT FOUND\n"); - } - - // Cleanup - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; - CFRelease(ioIn); CFRelease(ioOut); - } - return 0; -} +// test_perf_stats.m — What does _ANEPerformanceStats expose? +// Probe class methods, properties, instantiate, pass to request, read back. 
+#import +#import +#import +#import +#import +#import +#include "ane_compat.h" + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods:\n"); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties:\n"); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? 
pattr : "?"); + } + free(props); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + ane_detect_platform(); + ane_print_platform(); + + printf("=== ANE Performance Stats Probe ===\n"); + + dump_class("_ANEPerformanceStats"); + dump_class("_ANEPerfRequest"); + dump_class("ANEPerfRequest"); + dump_class("_ANEPerformanceCounters"); + dump_class("_ANEDeviceInfo"); + dump_class("_ANEModel"); + dump_class("_ANEInMemoryModel"); + dump_class("_ANERequest"); + dump_class("_ANEIOSurfaceObject"); + dump_class("_ANEInMemoryModelDescriptor"); + dump_class("_ANEClient"); + dump_class("_ANEVirtualClient"); + + // Try to instantiate _ANEPerformanceStats + printf("\n=== Instantiation Tests ===\n"); + Class perfClass = NSClassFromString(@"_ANEPerformanceStats"); + if (perfClass) { + @try { + id perfStats = [[perfClass alloc] init]; + printf("_ANEPerformanceStats alloc/init: %s\n", + perfStats ? [[perfStats description] UTF8String] : "nil"); + if (perfStats) { + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? 
[[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname, [[ex reason] UTF8String]); + } + } + free(props); + } + } @catch (NSException *ex) { + printf("Exception: %s\n", [[ex reason] UTF8String]); + } + } + + // Compile a working kernel and test perfStats in request + printf("\n=== Compile kernel and test perfStats in request ===\n"); + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + int CH = 64, SP = 32; + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)1.0f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot,1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); + + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), 
offset=uint64(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" + " } -> (y);\n" + "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [[NSFileManager defaultManager] createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + int ioBytes = CH * SP * 4; // fp32 + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + + // Try creating request WITH perfStats + if (perfClass) { + id perfStats = [[perfClass alloc] init]; + printf(" Creating request with 
perfStats=%s\n", perfStats ? "non-nil" : "nil"); + + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, perfStats, @0); + printf(" Request: %s\n", req ? "created" : "nil"); + + if (req) { + IOSurfaceLock(ioIn, 0, NULL); + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + IOSurfaceUnlock(ioIn, 0, NULL); + + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + printf(" Eval: %s\n", ok ? "OK" : [[e description] UTF8String]); + + if (ok && perfStats) { + printf("\n PerfStats after 1 eval:\n"); + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname); + } + } + free(props); + + printf("\n Running 100 evals...\n"); + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < 100; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + } + printf(" 100 evals in %.1fms (%.2fms/eval)\n", + tb_ms(mach_absolute_time()-t0), tb_ms(mach_absolute_time()-t0)/100.0); + + printf("\n PerfStats after 101 evals:\n"); + props = class_copyPropertyList(perfClass, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [perfStats valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? 
[[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname); + } + } + free(props); + } + } + } else { + printf(" _ANEPerformanceStats class NOT FOUND\n"); + } + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + } + return 0; +} diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m index 2802c6b..c0dd7d2 100644 --- a/training/test_qos_sweep.m +++ b/training/test_qos_sweep.m @@ -1,157 +1,160 @@ -// test_qos_sweep.m — Does QoS affect frequency/latency? -// Sweep QoS 0-63 on compile, load, eval of a working kernel. -#import -#import -#import -#import -#import -#import - -static mach_timebase_info_data_t g_tb; -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - mach_timebase_info(&g_tb); - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - - Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - Class g_I = NSClassFromString(@"_ANEInMemoryModel"); - Class g_AR = NSClassFromString(@"_ANERequest"); - Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - - // 256x256 conv, spatial=64 for measurable latency - int CH = 256, SP = 64; - int ws = CH*CH*2, tot = 128+ws; - uint8_t *blob = (uint8_t*)calloc(tot, 1); - blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; - *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; - _Float16 *wp = 
(_Float16*)(blob+128); - for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); - NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; - - NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; - NSFileManager *fm = [NSFileManager defaultManager]; - - printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n"); - printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6); - printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); - - unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 
31, 33, 40, 47, 50, 55, 60, 63}; - int n_qos = sizeof(qos_values)/sizeof(qos_values[0]); - - for (int qi = 0; qi < n_qos; qi++) { - unsigned int qos = qos_values[qi]; - NSError *e = nil; - - // Make unique weights per iteration so hex differs - _Float16 *wq = (_Float16*)(blob+128); - wq[0] = (_Float16)(0.001f * qi); - NSData *wdata_q = [NSData dataWithBytes:blob length:tot]; - NSDictionary *weights_q = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata_q}}; - - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), - milData, weights_q, nil); - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] - withIntermediateDirectories:YES attributes:nil error:nil]; - [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [wdata_q writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - - uint64_t t0 = mach_absolute_time(); - BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( - mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e); - double cms = tb_ms(mach_absolute_time() - t0); - - if (!cok) { - printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-"); - [fm removeItemAtPath:td error:nil]; - continue; - } - - t0 = mach_absolute_time(); - BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( - mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e); - double lms = tb_ms(mach_absolute_time() - t0); - - if (!lok) { - printf("%4u %8.1fms %10s %10s %10s LOAD_FAIL\n", qos, cms, "-", "-", "-"); - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - [fm 
removeItemAtPath:td error:nil]; - continue; - } - - int ioBytes = CH * SP * 4; - IOSurfaceRef ioIn = make_surface(ioBytes); - IOSurfaceRef ioOut = make_surface(ioBytes); - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - - IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; - IOSurfaceUnlock(ioIn, 0, NULL); - - t0 = mach_absolute_time(); - BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); - double ems1 = tb_ms(mach_absolute_time() - t0); - - if (!eok) { - printf("%4u %8.1fms %8.1fms %10s %10s EVAL_FAIL\n", qos, cms, lms, "-", "-"); - } else { - t0 = mach_absolute_time(); - for (int i = 0; i < 10; i++) { - ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); - } - double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0; - printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms OK\n", qos, cms, lms, ems1, ems_avg); - } - - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - CFRelease(ioIn); CFRelease(ioOut); - [fm removeItemAtPath:td error:nil]; - } - - printf("\nDone.\n"); - } - return 0; -} +// test_qos_sweep.m — Does QoS affect frequency/latency? +// Sweep QoS 0-63 on compile, load, eval of a working kernel. 
+#import +#import +#import +#import +#import +#import +#include "ane_compat.h" + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + ane_detect_platform(); + ane_print_platform(); + + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + // 256x256 conv, spatial=64 for measurable latency + int CH = 256, SP = 64; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + _Float16 *wp = (_Float16*)(blob+128); + for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + + NSString *mil = [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " 
tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" + " } -> (y);\n" + "}\n", g_ane_platform.mil_program, ane_mil_target(), CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + + NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + NSFileManager *fm = [NSFileManager defaultManager]; + + printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n"); + printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6); + printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); + + unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63}; + int n_qos = sizeof(qos_values)/sizeof(qos_values[0]); + + for (int qi = 0; qi < n_qos; qi++) { + unsigned int qos = qos_values[qi]; + NSError *e = nil; + + // Make unique weights per iteration so hex differs + _Float16 *wq = (_Float16*)(blob+128); + wq[0] = (_Float16)(0.001f * qi); + NSData *wdata_q = [NSData dataWithBytes:blob length:tot]; + NSDictionary *weights_q = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata_q}}; + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), + milData, weights_q, nil); + id mdl = 
((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata_q writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + uint64_t t0 = mach_absolute_time(); + BOOL cok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), qos, @{}, &e); + double cms = tb_ms(mach_absolute_time() - t0); + + if (!cok) { + printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-"); + [fm removeItemAtPath:td error:nil]; + continue; + } + + t0 = mach_absolute_time(); + BOOL lok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), qos, @{}, &e); + double lms = tb_ms(mach_absolute_time() - t0); + + if (!lok) { + printf("%4u %8.1fms %10s %10s %10s LOAD_FAIL\n", qos, cms, "-", "-", "-"); + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [fm removeItemAtPath:td error:nil]; + continue; + } + + int ioBytes = CH * SP * 4; + IOSurfaceRef ioIn = make_surface(ioBytes); + IOSurfaceRef ioOut = make_surface(ioBytes); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + IOSurfaceLock(ioIn, 0, NULL); + float 
*inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; + IOSurfaceUnlock(ioIn, 0, NULL); + + t0 = mach_absolute_time(); + BOOL eok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); + double ems1 = tb_ms(mach_absolute_time() - t0); + + if (!eok) { + printf("%4u %8.1fms %8.1fms %10s %10s EVAL_FAIL\n", qos, cms, lms, "-", "-"); + } else { + t0 = mach_absolute_time(); + for (int i = 0; i < 10; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + mdl, @selector(evaluateWithQoS:options:request:error:), qos, @{}, req, &e); + } + double ems_avg = tb_ms(mach_absolute_time() - t0) / 10.0; + printf("%4u %8.1fms %8.1fms %8.2fms %8.2fms OK\n", qos, cms, lms, ems1, ems_avg); + } + + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(ioIn); CFRelease(ioOut); + [fm removeItemAtPath:td error:nil]; + } + + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m index a248005..cb3fca1 100644 --- a/training/test_weight_reload.m +++ b/training/test_weight_reload.m @@ -1,253 +1,256 @@ -// test_weight_reload.m — Can we skip recompilation by rewriting weight blobs on disk? -// Compile a conv kernel with weights A, eval, verify output. -// Overwrite weights/weight.bin in tmpDir with weights B. -// unloadWithQoS: then loadWithQoS: (no recompile). -// Eval again — if output matches B @ x, compilation bottleneck is eliminated. 
-#import -#import -#import -#import -#import -#import -#include - -static mach_timebase_info_data_t g_tb; -static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -// Build weight blob matching inmem_peak format (single chunk) -static NSData *build_weight_blob(_Float16 *w, int rows, int cols) { - int ws = rows * cols * 2; - int tot = 128 + ws; - uint8_t *b = (uint8_t*)calloc(tot, 1); - b[0] = 1; b[4] = 2; - b[64] = 0xEF; b[65] = 0xBE; b[66] = 0xAD; b[67] = 0xDE; b[68] = 1; - *(uint32_t*)(b+72) = ws; - *(uint32_t*)(b+80) = 128; - memcpy(b + 128, w, ws); - return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES]; -} - -// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out -static NSString *gen_mil(int ch, int sp) { - return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), 
offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; -} - -int main() { - @autoreleasepool { - setbuf(stdout, NULL); - mach_timebase_info(&g_tb); - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - - Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - Class g_I = NSClassFromString(@"_ANEInMemoryModel"); - Class g_AR = NSClassFromString(@"_ANERequest"); - Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); - - if (!g_D || !g_I || !g_AR || !g_AIO) { - printf("FAIL: ANE classes not found\n"); - return 1; - } - - // Use 64-channel conv, spatial=32 (known to work on ANE) - int CH = 64, SP = 32; - - // Weight set A: scaled identity (1.0 on diagonal) - _Float16 *weightsA = (_Float16*)calloc(CH*CH, sizeof(_Float16)); - for (int i = 0; i < CH; i++) weightsA[i*CH+i] = (_Float16)1.0f; - - // Weight set B: 3x identity - _Float16 *weightsB = (_Float16*)calloc(CH*CH, sizeof(_Float16)); - for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f; - - NSData *wdataA = build_weight_blob(weightsA, CH, CH); - NSString *mil = gen_mil(CH, SP); - NSDictionary *weights = @{ - @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} - }; - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; - - // === Compile with weights A === - printf("=== Step 1: Compile with weights A (identity) ===\n"); - printf(" Kernel: %dx%d conv, spatial=%d\n", CH, CH, SP); - uint64_t t0 = mach_absolute_time(); - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil); - if (!desc) { printf("FAIL: desc=NULL\n"); return 1; } - id mdl = 
((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; - [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - - NSError *e = nil; - BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); - if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } - ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; } - double compile_ms = tb_ms(mach_absolute_time() - t0); - printf(" Compile+load: %.1fms\n", compile_ms); - printf(" tmpDir: %s\n", [td UTF8String]); - - // Build request and IOSurfaces (fp32 I/O) - int inBytes = CH * SP * 4; // fp32 - int outBytes = CH * SP * 4; - IOSurfaceRef ioIn = make_surface(inBytes); - IOSurfaceRef ioOut = make_surface(outBytes); - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - - // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01 - IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); 
- for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; - IOSurfaceUnlock(ioIn, 0, NULL); - - // Eval with weights A - printf("\n=== Step 2: Eval with weights A ===\n"); - ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; } - - IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]); - printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]); - // Save copy - float *outA_copy = (float*)malloc(outBytes); - memcpy(outA_copy, outA, outBytes); - IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); - - // === Step 3: Overwrite weight file with B, unload+load === - printf("\n=== Step 3: Overwrite weight.bin with B (3x identity), unload+load ===\n"); - NSData *wdataB = build_weight_blob(weightsB, CH, CH); - NSString *weightPath = [td stringByAppendingPathComponent:@"weights/weight.bin"]; - [wdataB writeToFile:weightPath atomically:YES]; - printf(" Wrote new weight.bin\n"); - - // Unload - t0 = mach_absolute_time(); - ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - double unload_ms = tb_ms(mach_absolute_time() - t0); - printf(" Unload: %s (%.2fms)\n", ok ? "OK" : "FAIL", unload_ms); - - // Reload (no compile!) - t0 = mach_absolute_time(); - ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - double reload_ms = tb_ms(mach_absolute_time() - t0); - printf(" Load (no recompile): %s (%.2fms)\n", ok ? 
"OK" : [[e description] UTF8String], reload_ms); - - if (!ok) { - printf("\n*** Load-after-overwrite FAILED — trying recompile+load ***\n"); - t0 = mach_absolute_time(); - ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); - printf(" Re-compile: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0)); - t0 = mach_absolute_time(); - ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - printf(" Re-load: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0)); - } - - // Build new request (re-use same surfaces) - wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); - wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); - req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - - // Re-write same input - IOSurfaceLock(ioIn, 0, NULL); - inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; - IOSurfaceUnlock(ioIn, 0, NULL); - - // Eval with (possibly reloaded) weights B - printf("\n=== Step 4: Eval after reload ===\n"); - ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - if (!ok) { printf("FAIL: eval after reload: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } - - IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]); - printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]); - - // Check: did the output change? - bool changed = false; - float max_diff = 0; - for (int i = 0; i < CH*SP; i++) { - float d = fabsf(outB[i] - outA_copy[i]); - if (d > max_diff) max_diff = d; - if (d > 0.001f) changed = true; - } - // Expected: output B should be 3x output A - bool correct_3x = true; - float max_3x_err = 0; - for (int i = 0; i < CH*SP; i++) { - float expected = outA_copy[i] * 3.0f; - float err = fabsf(outB[i] - expected); - if (err > max_3x_err) max_3x_err = err; - if (err > 0.1f) correct_3x = false; - } - IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); - - printf("\n=== RESULT ===\n"); - printf(" Max A-B diff: %.6f\n", max_diff); - printf(" Max 3x error: %.6f\n", max_3x_err); - printf(" Compile+load: %.1fms | Unload: %.1fms | Reload: %.1fms\n", compile_ms, unload_ms, reload_ms); - - if (changed && correct_3x) { - printf("\nSUCCESS: Weight reload works! Output matches 3x identity.\n"); - printf(" Speedup: compile=%.1fms vs reload=%.1fms (%.1fx faster)\n", - compile_ms, unload_ms + reload_ms, compile_ms / (unload_ms + reload_ms)); - printf(">>> Compilation bottleneck can be eliminated <<<\n"); - } else if (changed && !correct_3x) { - printf("\nPARTIAL: Output changed but doesn't match expected 3x.\n"); - } else { - printf("\nFAIL: Output did NOT change. 
Weight reload does not work.\n"); - printf(" ANE cached the compiled model — weights baked at compile time.\n"); - printf(">>> Need alternative: weightsBuffer IOSurface or async recompile <<<\n"); - } - - // Cleanup - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - [fm removeItemAtPath:td error:nil]; - CFRelease(ioIn); CFRelease(ioOut); - free(outA_copy); free(weightsA); free(weightsB); - } - return 0; -} +// test_weight_reload.m — Can we skip recompilation by rewriting weight blobs on disk? +// Compile a conv kernel with weights A, eval, verify output. +// Overwrite weights/weight.bin in tmpDir with weights B. +// unloadWithQoS: then loadWithQoS: (no recompile). +// Eval again — if output matches B @ x, compilation bottleneck is eliminated. +#import +#import +#import +#import +#import +#import +#include +#include "ane_compat.h" + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +// Build weight blob matching inmem_peak format (single chunk) +static NSData *build_weight_blob(_Float16 *w, int rows, int cols) { + int ws = rows * cols * 2; + int tot = 128 + ws; + uint8_t *b = (uint8_t*)calloc(tot, 1); + b[0] = 1; b[4] = 2; + b[64] = 0xEF; b[65] = 0xBE; b[66] = 0xAD; b[67] = 0xDE; b[68] = 1; + *(uint32_t*)(b+72) = ws; + *(uint32_t*)(b+80) = 128; + memcpy(b + 128, w, ws); + return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES]; +} + +// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out +static NSString *gen_mil(int ch, int sp) { + return [NSString stringWithFormat: + @"program(%s)\n" + "[buildInfo = 
dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n" + "{\n" + " func main<%s>(tensor x) {\n" + " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" + " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" + " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" + " tensor W = const()[name=string(\"W\"), " + "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=string(\"conv\")];\n" + " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" + " } -> (y);\n" + "}\n", g_ane_platform.mil_program, ane_mil_target(), ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; +} + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + ane_detect_platform(); + ane_print_platform(); + + Class g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class g_I = NSClassFromString(@"_ANEInMemoryModel"); + Class g_AR = NSClassFromString(@"_ANERequest"); + Class g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); + + if (!g_D || !g_I || !g_AR || !g_AIO) { + printf("FAIL: ANE classes not found\n"); + return 1; + } + + // Use 64-channel conv, spatial=32 (known to work on ANE) + int CH = 64, SP = 32; + + // Weight set A: scaled identity (1.0 on diagonal) + _Float16 *weightsA = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + 
for (int i = 0; i < CH; i++) weightsA[i*CH+i] = (_Float16)1.0f; + + // Weight set B: 3x identity + _Float16 *weightsB = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f; + + NSData *wdataA = build_weight_blob(weightsA, CH, CH); + NSString *mil = gen_mil(CH, SP); + NSDictionary *weights = @{ + @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} + }; + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + + // === Compile with weights A === + printf("=== Step 1: Compile with weights A (identity) ===\n"); + printf(" Kernel: %dx%d conv, spatial=%d\n", CH, CH, SP); + uint64_t t0 = mach_absolute_time(); + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, weights, nil); + if (!desc) { printf("FAIL: desc=NULL\n"); return 1; } + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; } + double compile_ms = 
tb_ms(mach_absolute_time() - t0); + printf(" Compile+load: %.1fms\n", compile_ms); + printf(" tmpDir: %s\n", [td UTF8String]); + + // Build request and IOSurfaces (fp32 I/O) + int inBytes = CH * SP * 4; // fp32 + int outBytes = CH * SP * 4; + IOSurfaceRef ioIn = make_surface(inBytes); + IOSurfaceRef ioOut = make_surface(outBytes); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01 + IOSurfaceLock(ioIn, 0, NULL); + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval with weights A + printf("\n=== Step 2: Eval with weights A ===\n"); + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + if (!ok) { printf("FAIL: eval: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } + + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]); + printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, + outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]); + // Save copy + float *outA_copy = (float*)malloc(outBytes); + memcpy(outA_copy, outA, outBytes); + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + + // === Step 3: Overwrite weight file with B, unload+load === + printf("\n=== Step 3: Overwrite weight.bin with B (3x identity), unload+load ===\n"); + NSData *wdataB = build_weight_blob(weightsB, CH, CH); + NSString *weightPath = [td stringByAppendingPathComponent:@"weights/weight.bin"]; + [wdataB writeToFile:weightPath atomically:YES]; + printf(" Wrote new weight.bin\n"); + + // Unload + t0 = mach_absolute_time(); + ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + double unload_ms = tb_ms(mach_absolute_time() - t0); + printf(" Unload: %s (%.2fms)\n", ok ? "OK" : "FAIL", unload_ms); + + // Reload (no compile!) + t0 = mach_absolute_time(); + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + double reload_ms = tb_ms(mach_absolute_time() - t0); + printf(" Load (no recompile): %s (%.2fms)\n", ok ? "OK" : [[e description] UTF8String], reload_ms); + + if (!ok) { + printf("\n*** Load-after-overwrite FAILED — trying recompile+load ***\n"); + t0 = mach_absolute_time(); + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + printf(" Re-compile: %s (%.2fms)\n", ok ? 
"OK" : "FAIL", tb_ms(mach_absolute_time() - t0)); + t0 = mach_absolute_time(); + ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + printf(" Re-load: %s (%.2fms)\n", ok ? "OK" : "FAIL", tb_ms(mach_absolute_time() - t0)); + } + + // Build new request (re-use same surfaces) + wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); + wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioOut); + req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + // Re-write same input + IOSurfaceLock(ioIn, 0, NULL); + inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + IOSurfaceUnlock(ioIn, 0, NULL); + + // Eval with (possibly reloaded) weights B + printf("\n=== Step 4: Eval after reload ===\n"); + ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); + if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; } + + IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); + float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); + printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]); + printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, + outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]); + + // Check: did the output change? 
+ bool changed = false; + float max_diff = 0; + for (int i = 0; i < CH*SP; i++) { + float d = fabsf(outB[i] - outA_copy[i]); + if (d > max_diff) max_diff = d; + if (d > 0.001f) changed = true; + } + // Expected: output B should be 3x output A + bool correct_3x = true; + float max_3x_err = 0; + for (int i = 0; i < CH*SP; i++) { + float expected = outA_copy[i] * 3.0f; + float err = fabsf(outB[i] - expected); + if (err > max_3x_err) max_3x_err = err; + if (err > 0.1f) correct_3x = false; + } + IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + + printf("\n=== RESULT ===\n"); + printf(" Max A-B diff: %.6f\n", max_diff); + printf(" Max 3x error: %.6f\n", max_3x_err); + printf(" Compile+load: %.1fms | Unload: %.1fms | Reload: %.1fms\n", compile_ms, unload_ms, reload_ms); + + if (changed && correct_3x) { + printf("\nSUCCESS: Weight reload works! Output matches 3x identity.\n"); + printf(" Speedup: compile=%.1fms vs reload=%.1fms (%.1fx faster)\n", + compile_ms, unload_ms + reload_ms, compile_ms / (unload_ms + reload_ms)); + printf(">>> Compilation bottleneck can be eliminated <<<\n"); + } else if (changed && !correct_3x) { + printf("\nPARTIAL: Output changed but doesn't match expected 3x.\n"); + } else { + printf("\nFAIL: Output did NOT change. 
Weight reload does not work.\n"); + printf(" ANE cached the compiled model — weights baked at compile time.\n"); + printf(">>> Need alternative: weightsBuffer IOSurface or async recompile <<<\n"); + } + + // Cleanup + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); + [fm removeItemAtPath:td error:nil]; + CFRelease(ioIn); CFRelease(ioOut); + free(outA_copy); free(weightsA); free(weightsB); + } + return 0; +} diff --git a/training/tiny_train.m b/training/tiny_train.m index e1e9d7d..ba90951 100644 --- a/training/tiny_train.m +++ b/training/tiny_train.m @@ -1,593 +1,597 @@ -// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward) -// y = W2 @ relu(W1 @ x), MSE loss, SGD update -// Pipeline: compile next kernels on background thread while ANE runs current batch -// Bypasses ANE 119-compile limit via exec() self-restart -#import -#import -#import -#import -#import -#import -#include -#include -#include - -static Class g_D, g_I, g_AR, g_AIO; - -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -static NSData *build_blob(const float *w, int rows, int cols) { - int wsize = rows * cols * 2; - int total = 128 + wsize; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf+72) = wsize; - 
*(uint32_t*)(buf+80) = 128; - _Float16 *fp16 = (_Float16*)(buf + 128); - for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i]; - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -static NSData *build_blob_transposed(const float *w, int rows, int cols) { - int wsize = cols * rows * 2; - int total = 128 + wsize; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf+72) = wsize; - *(uint32_t*)(buf+80) = 128; - _Float16 *fp16 = (_Float16*)(buf + 128); - for (int i = 0; i < rows; i++) - for (int j = 0; j < cols; j++) - fp16[j * rows + i] = (_Float16)w[i * cols + j]; - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { - return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = 
string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" - " } -> (y);\n}\n", - in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; -} - -typedef struct { - void *model; // CFBridgingRetain'd _ANEInMemoryModel - IOSurfaceRef ioIn, ioOut; - void *request; // CFBridgingRetain'd _ANERequest - void *tmpDir; // CFBridgingRetain'd NSString -} Kern; - -static int g_compile_count = 0; - -static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { - @autoreleasepool { - NSString *mil = gen_conv_mil(in_ch, out_ch, sp); - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; - NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}}; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil); - if (!desc) return NULL; - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; - [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; - __sync_fetch_and_add(&g_compile_count, 1); - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; - IOSurfaceRef ioI = make_surface(inB), ioO = 
make_surface(outB); - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - Kern *k = calloc(1, sizeof(Kern)); - k->model = CFBridgingRetain(mdl); - k->ioIn = ioI; k->ioOut = ioO; - k->request = CFBridgingRetain(req); - k->tmpDir = CFBridgingRetain(td); - return k; - } -} - -static void free_kern(Kern *k) { - if (!k) return; - id mdl = (__bridge id)k->model; - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); - CFRelease(k->ioIn); CFRelease(k->ioOut); - NSString *td = (__bridge id)k->tmpDir; - [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; - CFRelease(k->model); - CFRelease(k->request); - CFRelease(k->tmpDir); - free(k); -} - -static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; - IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); - IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); - NSError *e = nil; - id mdl = (__bridge id)k->model; - id req = (__bridge id)k->request; - ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); - IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); - IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); 
- for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); -} - -// === Checkpoint: save/restore training state for exec() restart === -#define CKPT_PATH "/tmp/ane_train_ckpt.bin" - -typedef struct { - int step; - float loss; - int D, H, S, total_steps; - float lr; - double cum_compile_ms, cum_train_ms, cum_wall_ms; - int cum_steps, cum_batches; -} CkptHeader; - -static void save_checkpoint(const char *path, int step, float loss, - int D, int H, int S, int total_steps, float lr, - const float *W1, const float *W2, - double cc, double ct, double cw, int cs, int cb) { - FILE *f = fopen(path, "wb"); - CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb}; - fwrite(&hdr, sizeof(hdr), 1, f); - fwrite(W1, sizeof(float), H * D, f); - fwrite(W2, sizeof(float), D * H, f); - fclose(f); -} - -static bool load_checkpoint(const char *path, CkptHeader *hdr, - float *W1, float *W2, int H, int D) { - FILE *f = fopen(path, "rb"); - if (!f) return false; - fread(hdr, sizeof(CkptHeader), 1, f); - fread(W1, sizeof(float), H * D, f); - fread(W2, sizeof(float), D * H, f); - fclose(f); - return true; -} - -#define MAX_COMPILES 100 -#define KERNELS_PER_STEP 4 -#define ACCUM_STEPS 10 - -// === Pipeline: background compile via GCD === -typedef struct { - Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd; - float *W1, *W2; - int D, H, S; - bool ok; - double compile_ms; -} PipelineCompile; - -static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) { - return (double)elapsed * tb.numer / tb.denom / 1e6; -} - -static mach_timebase_info_data_t g_tb; -// Serial queue ensures ANE compiles don't overlap with each other -static dispatch_queue_t g_compile_queue; - -int main(int argc, char *argv[]) { - @autoreleasepool { - setbuf(stdout, NULL); - ane_init(); - mach_timebase_info(&g_tb); - g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL); - - int D = 64, H = 128, S = 16; - int total_steps 
= 2000; - float lr = 1.0f; - int start_step = 0; - bool resuming = false; - - float *W1 = (float*)malloc(H * D * sizeof(float)); - float *W2 = (float*)malloc(D * H * sizeof(float)); - - if (argc > 1 && strcmp(argv[1], "--resume") == 0) { - CkptHeader hdr; - if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) { - start_step = hdr.step; - total_steps = hdr.total_steps; - lr = hdr.lr; - resuming = true; - printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss); - } - } - - // Cumulative stats (restored from checkpoint if resuming) - double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0; - int cum_steps = 0, cum_batches = 0; - if (resuming) { - CkptHeader hdr2; - FILE *f = fopen(CKPT_PATH, "rb"); - if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f); - cum_compile_ms = hdr2.cum_compile_ms; - cum_train_ms = hdr2.cum_train_ms; - cum_wall_ms = hdr2.cum_wall_ms; - cum_steps = hdr2.cum_steps; - cum_batches = hdr2.cum_batches; - } - } - - // FLOPs calculation - // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S - // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S - // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S - // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE) - // CPU FLOPs per step = 4*D*H*S (dW accumulation) - // Total FLOPs per step = 12*D*H*S - double ane_flops_per_step = 8.0 * D * H * S; - double cpu_flops_per_step = 4.0 * D * H * S; - double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step; - double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE - - if (!resuming) { - for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); - for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); - printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n"); - printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> y:[%d,%d]\n", S,D, H,D, D,H, S,D); - printf("Accum 
%d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS); - printf("ANE FP16 peak: 15.8 TFLOPS (M4) | Weights: %.1f KB\n\n", weight_bytes/1024.0); - printf("FLOPs/step: ANE=%.0f (fwd+bwd) CPU=%.0f (dW) Total=%.0f\n", - ane_flops_per_step, cpu_flops_per_step, total_flops_per_step); - printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n", - total_steps, lr, MAX_COMPILES); - } - - float *x = (float*)calloc(S * D, sizeof(float)); - float *y_target = (float*)calloc(S * D, sizeof(float)); - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) { - float v = sinf((t * D + i) * 0.1f); - x[t*D + i] = v; - y_target[t*D + i] = v; - } - - float *h = (float*)malloc(S * H * sizeof(float)); - float *h_relu = (float*)malloc(S * H * sizeof(float)); - float *y = (float*)malloc(S * D * sizeof(float)); - float *dy = (float*)malloc(S * D * sizeof(float)); - float *dh_relu = (float*)malloc(S * H * sizeof(float)); - float *dh = (float*)malloc(S * H * sizeof(float)); - float *dx_layer = (float*)malloc(S * D * sizeof(float)); - - Kern *k1_fwd = NULL, *k2_fwd = NULL; - Kern *k1_bwd = NULL, *k2_bwd = NULL; - float last_loss = 999.0f; - - // Stats - double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0; - double total_hidden_compile_ms = 0; // compile time hidden by pipeline - int total_batches = 0; - int total_steps_done = 0; - uint64_t t_wall_start = mach_absolute_time(); - - // First compile is synchronous (no pipeline yet) - { - uint64_t t0 = mach_absolute_time(); - k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); - k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); - k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); - k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); - double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); - total_compile_ms += cms; - if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { - printf("Initial compile failed!\n"); return 1; - } - 
printf("Initial compile: %.0fms\n", cms); - } - - int step = start_step; - while (step < total_steps) { - // Check compile budget - if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) { - free_kern(k1_fwd); free_kern(k2_fwd); - free_kern(k1_bwd); free_kern(k2_bwd); - save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, - cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, - cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), - cum_steps + total_steps_done, cum_batches + total_batches); - double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); - printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n", - step, g_compile_count, last_loss, wall); - fflush(stdout); - execl(argv[0], argv[0], "--resume", NULL); - perror("execl failed"); return 1; - } - - // === Run ACCUM_STEPS with current kernels === - float *aW1 = (float*)calloc(H * D, sizeof(float)); - float *aW2 = (float*)calloc(D * H, sizeof(float)); - int steps_this_batch = 0; - - // Pipeline: start compiling NEXT batch's kernels in background - // We'll apply gradients first, then launch compile with updated W - // But for pipeline, we compile AHEAD: while running batch N, compile for N+1 - // So we need to update weights BEFORE launching background compile - - uint64_t t_batch = mach_absolute_time(); - for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { - ane_eval_k(k1_fwd, x, h, D, H, S); - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; - ane_eval_k(k2_fwd, h_relu, y, H, D, S); - - float loss = 0; - for (int i = 0; i < S*D; i++) { - float diff = y[i] - y_target[i]; - loss += diff * diff; - dy[i] = 2.0f * diff / (S * D); - } - loss /= (S * D); - last_loss = loss; - - ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); - for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? 
dh_relu[i] : 0; - ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); - - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) - for (int j = 0; j < H; j++) - aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; - for (int t = 0; t < S; t++) - for (int i = 0; i < H; i++) - for (int j = 0; j < D; j++) - aW1[i*D + j] += dh[t*H + i] * x[t*D + j]; - - steps_this_batch++; - } - double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb); - total_train_ms += batch_ms; - - // Apply accumulated gradients - float scale = 1.0f / steps_this_batch; - for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale; - for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale; - free(aW1); free(aW2); - - total_steps_done += steps_this_batch; - total_batches++; - - // Print progress - double step_ms = batch_ms / steps_this_batch; - double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6); - double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6); - - if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) { - printf("step %-5d loss=%-10.6f %5.1fms/step ANE=%.2f GFLOPS total=%.2f GFLOPS compiles=%d\n", - step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count); - } - - // Pipeline: launch background compile with updated weights, - // then immediately start NEXT batch's ANE evals with OLD kernels - // while compile runs concurrently on GCD queue - bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES); - - if (can_pipeline) { - // Snapshot weights for background compile - PipelineCompile *pc = calloc(1, sizeof(PipelineCompile)); - pc->W1 = (float*)malloc(H * D * sizeof(float)); - pc->W2 = (float*)malloc(D * H * sizeof(float)); - memcpy(pc->W1, W1, H * D * sizeof(float)); - memcpy(pc->W2, W2, D * H * sizeof(float)); - pc->D = D; pc->H = H; pc->S = S; - - dispatch_semaphore_t sem = dispatch_semaphore_create(0); - - dispatch_async(g_compile_queue, ^{ - 
@autoreleasepool { - uint64_t t0 = mach_absolute_time(); - pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S); - pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S); - pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S); - pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S); - pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb); - pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd; - dispatch_semaphore_signal(sem); - } - }); - - // === While compile runs in background, do ANOTHER batch with OLD kernels === - if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) { - float *aW1b = (float*)calloc(H * D, sizeof(float)); - float *aW2b = (float*)calloc(D * H, sizeof(float)); - int steps_overlap = 0; - uint64_t t_overlap = mach_absolute_time(); - - for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { - ane_eval_k(k1_fwd, x, h, D, H, S); - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; - ane_eval_k(k2_fwd, h_relu, y, H, D, S); - - float loss = 0; - for (int i = 0; i < S*D; i++) { - float diff = y[i] - y_target[i]; - loss += diff * diff; - dy[i] = 2.0f * diff / (S * D); - } - loss /= (S * D); - last_loss = loss; - - ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); - for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? 
dh_relu[i] : 0; - ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); - - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) - for (int j = 0; j < H; j++) - aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; - for (int t = 0; t < S; t++) - for (int i = 0; i < H; i++) - for (int j = 0; j < D; j++) - aW1b[i*D + j] += dh[t*H + i] * x[t*D + j]; - steps_overlap++; - } - double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb); - total_train_ms += overlap_ms; - total_steps_done += steps_overlap; - total_batches++; - - // Apply these gradients with reduced LR (stale weights — 1 batch behind) - float sc = 0.5f / steps_overlap; // half LR for stale batch - for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc; - for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc; - free(aW1b); free(aW2b); - - if (total_batches % 5 == 1) { - double sm = overlap_ms / steps_overlap; - printf("step %-5d loss=%-10.6f %5.1fms/step (overlapped with compile) compiles=%d\n", - step - steps_overlap, last_loss, sm, g_compile_count); - } - } - - // Wait for compile to finish - dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER); - total_compile_ms += pc->compile_ms; - total_hidden_compile_ms += pc->compile_ms; // all hidden behind train - - free_kern(k1_fwd); free_kern(k2_fwd); - free_kern(k1_bwd); free_kern(k2_bwd); - - if (pc->ok) { - k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd; - k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd; - } else { - k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; - } - free(pc->W1); free(pc->W2); free(pc); - } else if (step < total_steps) { - // Synchronous compile (no budget for pipeline) - uint64_t t0 = mach_absolute_time(); - free_kern(k1_fwd); free_kern(k2_fwd); - free_kern(k1_bwd); free_kern(k2_bwd); - k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); - k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); - k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); - k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, 
D), H, D, S); - double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); - total_compile_ms += cms; - if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { - save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, - cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, - cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), - cum_steps + total_steps_done, cum_batches + total_batches); - fflush(stdout); - execl(argv[0], argv[0], "--resume", NULL); - perror("execl failed"); return 1; - } - } - - if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } - } - - total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); - // Add cumulative from previous exec() runs - total_compile_ms += cum_compile_ms; - total_train_ms += cum_train_ms; - total_wall_ms += cum_wall_ms; - total_steps_done += cum_steps; - total_batches += cum_batches; - - // === Final output === - printf("\nFinal output vs target (first 8):\n"); - if (k1_fwd && k2_fwd) { - ane_eval_k(k1_fwd, x, h, D, H, S); - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; - ane_eval_k(k2_fwd, h_relu, y, H, D, S); - } - printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); - printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); - - // === Efficiency Report === - printf("\n=== Efficiency Report ===\n"); - printf("Total steps: %d\n", total_steps_done); - printf("Total batches: %d (accum %d steps each)\n", total_batches, ACCUM_STEPS); - printf("Wall time: %.0f ms\n", total_wall_ms); - printf("Compile time: %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms); - printf("Train time: %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms); - printf("Overhead: %.0f ms (%.1f%%)\n", - total_wall_ms - total_compile_ms - total_train_ms, - 100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms); - printf("\n"); - printf("Avg compile: %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches); - printf("Avg train: %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done); - printf("Avg wall/step: %.2f ms\n", total_wall_ms / total_steps_done); - printf("\n"); - double ane_total_flops = ane_flops_per_step * total_steps_done; - double cpu_total_flops = cpu_flops_per_step * total_steps_done; - printf("ANE FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", - ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6)); - printf("CPU FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", - cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6)); - printf("Total FLOPs: %.3f MFLOP (%.2f GFLOPS sustained)\n", - (ane_total_flops + cpu_total_flops) / 1e6, - (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6)); - printf("\n"); - printf("ANE utilization: %.4f%% of 15.8 TFLOPS peak\n", - 100.0 * ane_total_flops / (total_train_ms * 1e6) / 15800.0); - printf("Weight params: %d (%.1f KB FP16)\n", - H*D + D*H, weight_bytes / 1024.0); - printf("Compile amortization: %.1f ms compile / 
%d steps = %.2f ms/step overhead\n",
-               total_compile_ms / total_batches, ACCUM_STEPS,
-               total_compile_ms / total_batches / ACCUM_STEPS);
-        printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms);
-        printf("Train fraction: %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms);
-
-        free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd);
-        free(W1); free(W2); free(x); free(y_target);
-        free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer);
-        unlink(CKPT_PATH);
-    }
-    return 0;
-}
+// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward)
+// y = W2 @ relu(W1 @ x), MSE loss, SGD update
+// Pipeline: compile next kernels on background thread while ANE runs current batch
+// Bypasses ANE 119-compile limit via exec() self-restart
+// NOTE(review): the header names on the #import/#include lines below are missing
+// in this copy of the patch — confirm them against the real source file.
+#import
+#import
+#import
+#import
+#import
+#import
+#include
+#include
+#include "ane_compat.h"
+#include
+
+// Private ANE classes, resolved by name at runtime in ane_init().
+static Class g_D, g_I, g_AR, g_AIO;
+
+// Loads the private AppleNeuralEngine framework and looks up the four classes
+// the rest of this file messages through objc_msgSend. Lookup failures are not
+// reported here; a missing class leaves the corresponding global nil.
+static void ane_init(void) {
+    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
+    g_D  = NSClassFromString(@"_ANEInMemoryModelDescriptor");
+    g_I  = NSClassFromString(@"_ANEInMemoryModel");
+    g_AR = NSClassFromString(@"_ANERequest");
+    g_AIO= NSClassFromString(@"_ANEIOSurfaceObject");
+}
+
+// Creates a one-row IOSurface of `bytes` bytes (width = bytes, height = 1,
+// one byte per element). The caller owns the returned surface and must
+// CFRelease it; the result may be NULL if IOSurfaceCreate fails.
+static IOSurfaceRef make_surface(size_t bytes) {
+    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
+        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
+        (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes),
+        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
+}
+
+// Allocates a zeroed weight blob for `elems` FP16 values and fills in the
+// 128-byte preamble shared by build_blob() and build_blob_transposed():
+// bytes 0 and 4 are 0x01/0x02, bytes 64..67 are EF BE AD DE, byte 68 is 0x01,
+// the 32-bit field at 72 holds the payload size in bytes and the 32-bit field
+// at 80 holds 128 (where the FP16 payload starts). The field values are taken
+// as-is from the original code; their meaning to the ANE compiler is not
+// established in this file. Returns NULL on allocation failure; on success
+// stores the total buffer length in *total_out.
+static uint8_t *new_weight_blob(int elems, int *total_out) {
+    int wsize = elems * 2;
+    int total = 128 + wsize;
+    uint8_t *buf = (uint8_t*)calloc(total, 1);
+    if (!buf) return NULL;
+    buf[0] = 0x01; buf[4] = 0x02;
+    buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE;
+    buf[68] = 0x01;
+    // memcpy instead of a uint32_t* cast: same bytes, no aliasing assumptions.
+    uint32_t size_field = (uint32_t)wsize, payload_offset = 128;
+    memcpy(buf + 72, &size_field, sizeof(size_field));
+    memcpy(buf + 80, &payload_offset, sizeof(payload_offset));
+    *total_out = total;
+    return buf;
+}
+
+// Packs a row-major rows x cols float matrix into a weight blob, converting
+// each element to FP16 in the same order. Returns nil if allocation fails.
+// The returned NSData owns the buffer (freeWhenDone:YES).
+static NSData *build_blob(const float *w, int rows, int cols) {
+    int total = 0;
+    uint8_t *buf = new_weight_blob(rows * cols, &total);
+    if (!buf) return nil;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Same as build_blob(), but stores the transpose: element (i, j) of the
+// rows x cols input lands at (j, i) of the cols x rows FP16 payload.
+// Returns nil if allocation fails.
+static NSData *build_blob_transposed(const float *w, int rows, int cols) {
+    int total = 0;
+    uint8_t *buf = new_weight_blob(cols * rows, &total);
+    if (!buf) return nil;
+    _Float16 *fp16 = (_Float16*)(buf + 128);
+    for (int i = 0; i < rows; i++)
+        for (int j = 0; j < cols; j++)
+            fp16[j * rows + i] = (_Float16)w[i * cols + j];
+    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
+}
+
+// Builds the MIL program text for one conv kernel: cast the input to fp16,
+// convolve with the weights read from weights/weight.bin, cast back to fp32.
+// The program version and target come from ane_compat.h
+// (g_ane_platform.mil_program, ane_mil_target()).
+// NOTE(review): this copy of the format string has only two %s specifiers but
+// twelve trailing integer arguments, so the tensor type/shape annotations look
+// like they were stripped in transit — confirm against the real source before
+// relying on this text.
+static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
+    return [NSString stringWithFormat:
+        @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, "
+        "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, "
+        "{\"coremltools-version\", \"\"}})]\n{\n"
+        " func main<%s>(tensor x) {\n"
+        " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
+        " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
+        " tensor W = const()[name = string(\"W\"), "
+        "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
+        " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n"
+        " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n"
+        " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n"
+        " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, "
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
+        " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
+        " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        " } -> (y);\n}\n",
+        g_ane_platform.mil_program, ane_mil_target(),
+        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
+}
+
+// One compiled-and-loaded ANE kernel plus the resources it needs at eval time.
+// Every pointer is owned by the Kern and released in free_kern().
+typedef struct {
+    void *model;           // CFBridgingRetain'd _ANEInMemoryModel
+    IOSurfaceRef ioIn, ioOut;
+    void *request;         // CFBridgingRetain'd _ANERequest
+    void *tmpDir;          // CFBridgingRetain'd NSString
+} Kern;
+
+static void free_kern(Kern *k);
+
+// Number of successful compile+load cycles in this process. Incremented
+// atomically because compiles also run on the background compile queue.
+static int g_compile_count = 0;
+
+// Reports a failed ANE call on stderr, including the NSError when one was set.
+static void log_ane_error(const char *what, NSError *err) {
+    const char *detail = err ? [[err description] UTF8String] : NULL;
+    fprintf(stderr, "[ane] %s failed: %s\n", what, detail ? detail : "no error details");
+}
+
+// Asks a loaded model to unload itself; the result is intentionally ignored
+// because this is only used on teardown paths.
+static void unload_model(id mdl) {
+    NSError *e = nil;
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);
+}
+
+// Compiles and loads a conv kernel for the given weight blob and shape.
+//   blob    weight blob from build_blob()/build_blob_transposed(); nil -> NULL
+//   in_ch   input channels, out_ch output channels, sp spatial length
+// Returns a heap-allocated Kern the caller must pass to free_kern(), or NULL on
+// any failure. On failure the temp directory written for the model is removed
+// and the error is reported on stderr. g_compile_count is bumped only after
+// both compile and load succeed.
+static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
+    @autoreleasepool {
+        // A nil blob would throw inside the dictionary literal below.
+        if (!blob) return NULL;
+        NSString *mil = gen_conv_mil(in_ch, out_ch, sp);
+        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+        NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}};
+        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil);
+        if (!desc) return NULL;
+        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
+        if (!mdl) return NULL;
+        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
+        // stringByAppendingPathComponent: raises on a nil argument.
+        if (!hx) return NULL;
+        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
+        NSFileManager *fm = [NSFileManager defaultManager];
+        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
+        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
+        [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
+        NSError *e = nil;
+        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+            log_ane_error("compile", e);
+            [fm removeItemAtPath:td error:nil];   // don't leave model files behind
+            return NULL;
+        }
+        if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) {
+            log_ane_error("load", e);
+            [fm removeItemAtPath:td error:nil];
+            return NULL;
+        }
+        __sync_fetch_and_add(&g_compile_count, 1);
+
+        Kern *k = calloc(1, sizeof(Kern));
+        if (!k) {
+            unload_model(mdl);
+            [fm removeItemAtPath:td error:nil];
+            return NULL;
+        }
+        // From here on free_kern() can undo everything, whatever is still NULL.
+        k->model = CFBridgingRetain(mdl);
+        k->tmpDir = CFBridgingRetain(td);
+        size_t inB = (size_t)in_ch * sp * 4, outB = (size_t)out_ch * sp * 4;
+        k->ioIn = make_surface(inB);
+        k->ioOut = make_surface(outB);
+        if (!k->ioIn || !k->ioOut) {
+            fprintf(stderr, "[ane] IOSurface allocation failed (%zu in / %zu out bytes)\n", inB, outB);
+            free_kern(k);
+            return NULL;
+        }
+        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioIn);
+        id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), k->ioOut);
+        id req = nil;
+        if (wI && wO) {   // array literals throw on nil elements
+            req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR,
+                @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
+                @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
+        }
+        if (!req) {
+            fprintf(stderr, "[ane] request creation failed\n");
+            free_kern(k);
+            return NULL;
+        }
+        k->request = CFBridgingRetain(req);
+        return k;
+    }
+}
+
+// Unloads the model, releases the surfaces and retained objects, deletes the
+// kernel's temp directory and frees the struct. Accepts NULL and partially
+// initialised kernels (any field may be NULL).
+static void free_kern(Kern *k) {
+    if (!k) return;
+    if (k->model) unload_model((__bridge id)k->model);
+    if (k->ioIn) CFRelease(k->ioIn);
+    if (k->ioOut) CFRelease(k->ioOut);
+    if (k->tmpDir) {
+        NSString *td = (__bridge id)k->tmpDir;
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+    }
+    if (k->model) CFRelease(k->model);
+    if (k->request) CFRelease(k->request);
+    if (k->tmpDir) CFRelease(k->tmpDir);
+    free(k);
+}
+
+// Runs one kernel evaluation.
+//   in   sp x in_ch floats, indexed in[t*in_ch + c]
+//   out  sp x out_ch floats, indexed out[t*out_ch + c]
+// The surfaces hold the data channel-major (index c*sp + t), so the transpose
+// is done while copying straight into and out of the locked surface; no
+// scratch buffers are allocated. An evaluation failure is reported on stderr
+// and the output surface is still copied out, as before.
+static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
+    // Local pool: this runs thousands of times under main's single outer pool.
+    @autoreleasepool {
+        IOSurfaceLock(k->ioIn, 0, NULL);
+        float *surf_in = (float*)IOSurfaceGetBaseAddress(k->ioIn);
+        for (int c = 0; c < in_ch; c++)
+            for (int t = 0; t < sp; t++)
+                surf_in[c*sp + t] = in[t*in_ch + c];
+        IOSurfaceUnlock(k->ioIn, 0, NULL);
+
+        NSError *e = nil;
+        id mdl = (__bridge id)k->model;
+        id req = (__bridge id)k->request;
+        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
+            mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
+        if (!ok) log_ane_error("evaluate", e);
+
+        IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+        const float *surf_out = (const float*)IOSurfaceGetBaseAddress(k->ioOut);
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < out_ch; c++)
+                out[t*out_ch + c] = surf_out[c*sp + t];
+        IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
+    }
+}
+
+// === Checkpoint: save/restore training state for exec() restart ===
+#define CKPT_PATH "/tmp/ane_train_ckpt.bin"
+
+// On-disk checkpoint header, written raw and followed by W1 (H*D floats) and
+// W2 (D*H floats). The cum_* fields carry totals across exec() restarts.
+typedef struct {
+    int step;
+    float loss;
+    int D, H, S, total_steps;
+    float lr;
+    double cum_compile_ms, cum_train_ms, cum_wall_ms;
+    int cum_steps, cum_batches;
+} CkptHeader;
+
+// Writes the header and both weight matrices to `path`.
+//   cc/ct/cw  cumulative compile / train / wall milliseconds
+//   cs/cb     cumulative steps / batches
+// There is no return value, so failures are reported on stderr; a partially
+// written file is deleted so a later --resume cannot pick it up.
+static void save_checkpoint(const char *path, int step, float loss,
+                            int D, int H, int S, int total_steps, float lr,
+                            const float *W1, const float *W2,
+                            double cc, double ct, double cw, int cs, int cb) {
+    FILE *f = fopen(path, "wb");
+    if (!f) { perror("save_checkpoint: fopen"); return; }
+    CkptHeader hdr = {
+        .step = step, .loss = loss, .D = D, .H = H, .S = S,
+        .total_steps = total_steps, .lr = lr,
+        .cum_compile_ms = cc, .cum_train_ms = ct, .cum_wall_ms = cw,
+        .cum_steps = cs, .cum_batches = cb,
+    };
+    size_t n1 = (size_t)H * D, n2 = (size_t)D * H;
+    bool ok = fwrite(&hdr, sizeof(hdr), 1, f) == 1
+           && fwrite(W1, sizeof(float), n1, f) == n1
+           && fwrite(W2, sizeof(float), n2, f) == n2;
+    if (fclose(f) != 0) ok = false;   // buffered data may fail to flush here
+    if (!ok) {
+        fprintf(stderr, "save_checkpoint: incomplete write to %s\n", path);
+        remove(path);
+    }
+}
+
+// Reads a checkpoint written by save_checkpoint() into *hdr, W1 and W2.
+// Returns false if the file is missing, truncated, or was saved with different
+// D/H than the caller's buffers were sized for. On false the contents of *hdr,
+// W1 and W2 are unspecified and must not be used.
+static bool load_checkpoint(const char *path, CkptHeader *hdr,
+                            float *W1, float *W2, int H, int D) {
+    FILE *f = fopen(path, "rb");
+    if (!f) return false;
+    size_t n1 = (size_t)H * D, n2 = (size_t)D * H;
+    bool ok = fread(hdr, sizeof(CkptHeader), 1, f) == 1
+           && hdr->D == D && hdr->H == H          // buffers are sized from H and D
+           && fread(W1, sizeof(float), n1, f) == n1
+           && fread(W2, sizeof(float), n2, f) == n2;
+    fclose(f);
+    return ok;
+}
+
+// Compile budget per process: main() checkpoints and exec()s itself before
+// g_compile_count would exceed MAX_COMPILES. Each weight update recompiles
+// KERNELS_PER_STEP kernels, after accumulating gradients for ACCUM_STEPS steps.
+#define MAX_COMPILES 100
+#define KERNELS_PER_STEP 4
+#define ACCUM_STEPS 10
+
+// === Pipeline: background compile via GCD ===
+// Work item for one background recompile: a private snapshot of the weights
+// (W1, W2) and shape (D, H, S) going in; the four kernels, the success flag
+// and the compile time coming out.
+typedef struct {
+    Kern *k1_fwd, *k2_fwd, *k1_bwd, *k2_bwd;
+    float *W1, *W2;
+    int D, H, S;
+    bool ok;
+    double compile_ms;
+} PipelineCompile;
+
+// Converts a mach_absolute_time() tick delta to milliseconds using `tb`.
+static double tb_to_ms(uint64_t elapsed, mach_timebase_info_data_t tb) {
+    return (double)elapsed * tb.numer / tb.denom / 1e6;
+}
+
+static mach_timebase_info_data_t g_tb;
+// Serial queue ensures ANE compiles don't overlap with each other
+static dispatch_queue_t g_compile_queue;
+
+int main(int argc, char *argv[]) {
+    @autoreleasepool {
+        setbuf(stdout, NULL);
+        ane_init();
+        ane_detect_platform();
+        ane_print_platform();
+        mach_timebase_info(&g_tb);
+        g_compile_queue = dispatch_queue_create("ane.compile", DISPATCH_QUEUE_SERIAL);
+
+ int D = 64, H = 128, S = 16; + int total_steps = 2000; + float lr = 1.0f; + int start_step = 0; + bool resuming = false; + + float *W1 = (float*)malloc(H * D * sizeof(float)); + float *W2 = (float*)malloc(D * H * sizeof(float)); + + if (argc > 1 && strcmp(argv[1], "--resume") == 0) { + CkptHeader hdr; + if (load_checkpoint(CKPT_PATH, &hdr, W1, W2, H, D)) { + start_step = hdr.step; + total_steps = hdr.total_steps; + lr = hdr.lr; + resuming = true; + printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss); + } + } + + // Cumulative stats (restored from checkpoint if resuming) + double cum_compile_ms = 0, cum_train_ms = 0, cum_wall_ms = 0; + int cum_steps = 0, cum_batches = 0; + if (resuming) { + CkptHeader hdr2; + FILE *f = fopen(CKPT_PATH, "rb"); + if (f) { fread(&hdr2, sizeof(hdr2), 1, f); fclose(f); + cum_compile_ms = hdr2.cum_compile_ms; + cum_train_ms = hdr2.cum_train_ms; + cum_wall_ms = hdr2.cum_wall_ms; + cum_steps = hdr2.cum_steps; + cum_batches = hdr2.cum_batches; + } + } + + // FLOPs calculation + // Forward: W1[H,D] @ x[D,S] = 2*H*D*S, W2[D,H] @ h[H,S] = 2*D*H*S → total fwd = 4*D*H*S + // Backward dx: W2^T[H,D] @ dy[D,S] = 2*H*D*S, W1^T[D,H] @ dh[H,S] = 2*D*H*S → total bwd = 4*D*H*S + // dW (CPU): dW2[D,H] = dy[D,S] @ h^T[S,H] = 2*D*S*H, dW1 same → total dW = 4*D*H*S + // ANE FLOPs per step = 8*D*H*S (fwd + bwd on ANE) + // CPU FLOPs per step = 4*D*H*S (dW accumulation) + // Total FLOPs per step = 12*D*H*S + double ane_flops_per_step = 8.0 * D * H * S; + double cpu_flops_per_step = 4.0 * D * H * S; + double total_flops_per_step = ane_flops_per_step + cpu_flops_per_step; + double weight_bytes = (H*D + D*H) * 2.0; // FP16 weights on ANE + + if (!resuming) { + for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); + for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); + printf("=== ANE Training: Pipeline Parallel + Grad Accumulation ===\n"); + printf("x:[%d,%d] -> W1:[%d,%d] -> ReLU -> W2:[%d,%d] -> 
y:[%d,%d]\n", S,D, H,D, D,H, S,D); + printf("Accum %d steps per recompile | Pipeline: compile overlaps ANE eval\n", ACCUM_STEPS); + printf("ANE FP16 peak: %.1f TFLOPS (%s) | Weights: %.1f KB\n\n", ane_peak_tflops(), g_ane_platform.chip_name, weight_bytes/1024.0); + printf("FLOPs/step: ANE=%.0f (fwd+bwd) CPU=%.0f (dW) Total=%.0f\n", + ane_flops_per_step, cpu_flops_per_step, total_flops_per_step); + printf("Steps: %d, LR: %.4f, exec() budget: %d compiles\n\n", + total_steps, lr, MAX_COMPILES); + } + + float *x = (float*)calloc(S * D, sizeof(float)); + float *y_target = (float*)calloc(S * D, sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float v = sinf((t * D + i) * 0.1f); + x[t*D + i] = v; + y_target[t*D + i] = v; + } + + float *h = (float*)malloc(S * H * sizeof(float)); + float *h_relu = (float*)malloc(S * H * sizeof(float)); + float *y = (float*)malloc(S * D * sizeof(float)); + float *dy = (float*)malloc(S * D * sizeof(float)); + float *dh_relu = (float*)malloc(S * H * sizeof(float)); + float *dh = (float*)malloc(S * H * sizeof(float)); + float *dx_layer = (float*)malloc(S * D * sizeof(float)); + + Kern *k1_fwd = NULL, *k2_fwd = NULL; + Kern *k1_bwd = NULL, *k2_bwd = NULL; + float last_loss = 999.0f; + + // Stats + double total_compile_ms = 0, total_train_ms = 0, total_wall_ms = 0; + double total_hidden_compile_ms = 0; // compile time hidden by pipeline + int total_batches = 0; + int total_steps_done = 0; + uint64_t t_wall_start = mach_absolute_time(); + + // First compile is synchronous (no pipeline yet) + { + uint64_t t0 = mach_absolute_time(); + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); + double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); + total_compile_ms += cms; + if (!k1_fwd || 
!k2_fwd || !k1_bwd || !k2_bwd) { + printf("Initial compile failed!\n"); return 1; + } + printf("Initial compile: %.0fms\n", cms); + } + + int step = start_step; + while (step < total_steps) { + // Check compile budget + if (g_compile_count + KERNELS_PER_STEP > MAX_COMPILES) { + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, + cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, + cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), + cum_steps + total_steps_done, cum_batches + total_batches); + double wall = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); + printf("[exec() restart at step %d, %d compiles, loss=%.6f, wall=%.0fms]\n", + step, g_compile_count, last_loss, wall); + fflush(stdout); + execl(argv[0], argv[0], "--resume", NULL); + perror("execl failed"); return 1; + } + + // === Run ACCUM_STEPS with current kernels === + float *aW1 = (float*)calloc(H * D, sizeof(float)); + float *aW2 = (float*)calloc(D * H, sizeof(float)); + int steps_this_batch = 0; + + // Pipeline: start compiling NEXT batch's kernels in background + // We'll apply gradients first, then launch compile with updated W + // But for pipeline, we compile AHEAD: while running batch N, compile for N+1 + // So we need to update weights BEFORE launching background compile + + uint64_t t_batch = mach_absolute_time(); + for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + last_loss = loss; + + ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? 
dh_relu[i] : 0; + ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); + + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + aW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + aW1[i*D + j] += dh[t*H + i] * x[t*D + j]; + + steps_this_batch++; + } + double batch_ms = tb_to_ms(mach_absolute_time() - t_batch, g_tb); + total_train_ms += batch_ms; + + // Apply accumulated gradients + float scale = 1.0f / steps_this_batch; + for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1[i] * scale; + for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2[i] * scale; + free(aW1); free(aW2); + + total_steps_done += steps_this_batch; + total_batches++; + + // Print progress + double step_ms = batch_ms / steps_this_batch; + double ane_gflops = (ane_flops_per_step * steps_this_batch) / (batch_ms * 1e6); + double total_gflops = (total_flops_per_step * steps_this_batch) / (batch_ms * 1e6); + + if (total_batches % 5 == 1 || total_batches <= 2 || step >= total_steps) { + printf("step %-5d loss=%-10.6f %5.1fms/step ANE=%.2f GFLOPS total=%.2f GFLOPS compiles=%d\n", + step - steps_this_batch, last_loss, step_ms, ane_gflops, total_gflops, g_compile_count); + } + + // Pipeline: launch background compile with updated weights, + // then immediately start NEXT batch's ANE evals with OLD kernels + // while compile runs concurrently on GCD queue + bool can_pipeline = (step < total_steps) && (g_compile_count + KERNELS_PER_STEP <= MAX_COMPILES); + + if (can_pipeline) { + // Snapshot weights for background compile + PipelineCompile *pc = calloc(1, sizeof(PipelineCompile)); + pc->W1 = (float*)malloc(H * D * sizeof(float)); + pc->W2 = (float*)malloc(D * H * sizeof(float)); + memcpy(pc->W1, W1, H * D * sizeof(float)); + memcpy(pc->W2, W2, D * H * sizeof(float)); + pc->D = D; pc->H = H; pc->S = S; + + dispatch_semaphore_t sem = dispatch_semaphore_create(0); + + dispatch_async(g_compile_queue, ^{ + 
@autoreleasepool { + uint64_t t0 = mach_absolute_time(); + pc->k1_fwd = compile_kern_with_blob(build_blob(pc->W1, pc->H, pc->D), pc->D, pc->H, pc->S); + pc->k2_fwd = compile_kern_with_blob(build_blob(pc->W2, pc->D, pc->H), pc->H, pc->D, pc->S); + pc->k2_bwd = compile_kern_with_blob(build_blob_transposed(pc->W2, pc->D, pc->H), pc->D, pc->H, pc->S); + pc->k1_bwd = compile_kern_with_blob(build_blob_transposed(pc->W1, pc->H, pc->D), pc->H, pc->D, pc->S); + pc->compile_ms = tb_to_ms(mach_absolute_time() - t0, g_tb); + pc->ok = pc->k1_fwd && pc->k2_fwd && pc->k1_bwd && pc->k2_bwd; + dispatch_semaphore_signal(sem); + } + }); + + // === While compile runs in background, do ANOTHER batch with OLD kernels === + if (step < total_steps && k1_fwd && k2_fwd && k1_bwd && k2_bwd) { + float *aW1b = (float*)calloc(H * D, sizeof(float)); + float *aW2b = (float*)calloc(D * H, sizeof(float)); + int steps_overlap = 0; + uint64_t t_overlap = mach_absolute_time(); + + for (int a = 0; a < ACCUM_STEPS && step < total_steps; a++, step++) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + last_loss = loss; + + ane_eval_k(k2_bwd, dy, dh_relu, D, H, S); + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? 
dh_relu[i] : 0; + ane_eval_k(k1_bwd, dh, dx_layer, H, D, S); + + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + aW2b[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + aW1b[i*D + j] += dh[t*H + i] * x[t*D + j]; + steps_overlap++; + } + double overlap_ms = tb_to_ms(mach_absolute_time() - t_overlap, g_tb); + total_train_ms += overlap_ms; + total_steps_done += steps_overlap; + total_batches++; + + // Apply these gradients with reduced LR (stale weights — 1 batch behind) + float sc = 0.5f / steps_overlap; // half LR for stale batch + for (int i = 0; i < H*D; i++) W1[i] -= lr * aW1b[i] * sc; + for (int i = 0; i < D*H; i++) W2[i] -= lr * aW2b[i] * sc; + free(aW1b); free(aW2b); + + if (total_batches % 5 == 1) { + double sm = overlap_ms / steps_overlap; + printf("step %-5d loss=%-10.6f %5.1fms/step (overlapped with compile) compiles=%d\n", + step - steps_overlap, last_loss, sm, g_compile_count); + } + } + + // Wait for compile to finish + dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER); + total_compile_ms += pc->compile_ms; + total_hidden_compile_ms += pc->compile_ms; // all hidden behind train + + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + + if (pc->ok) { + k1_fwd = pc->k1_fwd; k2_fwd = pc->k2_fwd; + k1_bwd = pc->k1_bwd; k2_bwd = pc->k2_bwd; + } else { + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + } + free(pc->W1); free(pc->W2); free(pc); + } else if (step < total_steps) { + // Synchronous compile (no budget for pipeline) + uint64_t t0 = mach_absolute_time(); + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, 
D), H, D, S); + double cms = tb_to_ms(mach_absolute_time() - t0, g_tb); + total_compile_ms += cms; + if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { + save_checkpoint(CKPT_PATH, step, last_loss, D, H, S, total_steps, lr, W1, W2, + cum_compile_ms + total_compile_ms, cum_train_ms + total_train_ms, + cum_wall_ms + tb_to_ms(mach_absolute_time() - t_wall_start, g_tb), + cum_steps + total_steps_done, cum_batches + total_batches); + fflush(stdout); + execl(argv[0], argv[0], "--resume", NULL); + perror("execl failed"); return 1; + } + } + + if (last_loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } + } + + total_wall_ms = tb_to_ms(mach_absolute_time() - t_wall_start, g_tb); + // Add cumulative from previous exec() runs + total_compile_ms += cum_compile_ms; + total_train_ms += cum_train_ms; + total_wall_ms += cum_wall_ms; + total_steps_done += cum_steps; + total_batches += cum_batches; + + // === Final output === + printf("\nFinal output vs target (first 8):\n"); + if (k1_fwd && k2_fwd) { + ane_eval_k(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; + ane_eval_k(k2_fwd, h_relu, y, H, D, S); + } + printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); + printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); + + // === Efficiency Report === + printf("\n=== Efficiency Report ===\n"); + printf("Total steps: %d\n", total_steps_done); + printf("Total batches: %d (accum %d steps each)\n", total_batches, ACCUM_STEPS); + printf("Wall time: %.0f ms\n", total_wall_ms); + printf("Compile time: %.0f ms (%.1f%%)\n", total_compile_ms, 100.0*total_compile_ms/total_wall_ms); + printf("Train time: %.0f ms (%.1f%%)\n", total_train_ms, 100.0*total_train_ms/total_wall_ms); + printf("Overhead: %.0f ms (%.1f%%)\n", + total_wall_ms - total_compile_ms - total_train_ms, + 100.0*(total_wall_ms - total_compile_ms - total_train_ms)/total_wall_ms); + printf("\n"); + printf("Avg compile: %.1f ms per batch (4 kernels)\n", total_compile_ms / total_batches); + printf("Avg train: %.2f ms per step (ANE fwd+bwd + CPU dW)\n", total_train_ms / total_steps_done); + printf("Avg wall/step: %.2f ms\n", total_wall_ms / total_steps_done); + printf("\n"); + double ane_total_flops = ane_flops_per_step * total_steps_done; + double cpu_total_flops = cpu_flops_per_step * total_steps_done; + printf("ANE FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", + ane_total_flops / 1e6, ane_total_flops / (total_train_ms * 1e6)); + printf("CPU FLOPs total: %.3f MFLOP (%.2f GFLOPS sustained)\n", + cpu_total_flops / 1e6, cpu_total_flops / (total_train_ms * 1e6)); + printf("Total FLOPs: %.3f MFLOP (%.2f GFLOPS sustained)\n", + (ane_total_flops + cpu_total_flops) / 1e6, + (ane_total_flops + cpu_total_flops) / (total_train_ms * 1e6)); + printf("\n"); + printf("ANE utilization: %.4f%% of %.1f TFLOPS peak\n", + 100.0 * ane_total_flops / (total_train_ms * 1e6) / (ane_peak_tflops() * 1000.0), ane_peak_tflops()); + printf("Weight params: %d (%.1f KB FP16)\n", + H*D + D*H, weight_bytes / 1024.0); + 
printf("Compile amortization: %.1f ms compile / %d steps = %.2f ms/step overhead\n", + total_compile_ms / total_batches, ACCUM_STEPS, + total_compile_ms / total_batches / ACCUM_STEPS); + printf("Compile fraction: %.1f%% of wall time\n", 100.0 * total_compile_ms / total_wall_ms); + printf("Train fraction: %.1f%% of wall time (useful work)\n", 100.0 * total_train_ms / total_wall_ms); + + free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); + free(W1); free(W2); free(x); free(y_target); + free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); + unlink(CKPT_PATH); + } + return 0; +} diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m index c22a90c..54e9ce5 100644 --- a/training/tiny_train_old.m +++ b/training/tiny_train_old.m @@ -1,309 +1,313 @@ -// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward) -// y = W2 @ relu(W1 @ x), MSE loss, SGD update -// Forward: ANE conv with baked weights -// Backward dx: ANE conv with transposed baked weights -// Backward dW: CPU (outer product, memory-bound) -#import -#import -#import -#import -#import -#import -#include - -static Class g_D, g_I, g_AR, g_AIO; - -static void ane_init(void) { - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); - g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); - g_I = NSClassFromString(@"_ANEInMemoryModel"); - g_AR = NSClassFromString(@"_ANERequest"); - g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); -} - -static IOSurfaceRef make_surface(size_t bytes) { - return IOSurfaceCreate((__bridge CFDictionaryRef)@{ - (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, - (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), - (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); -} - -static NSData *build_blob(const float *w, int rows, int cols) { - int wsize = rows * cols * 2; - int total = 128 + wsize; - uint8_t *buf = 
(uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf+72) = wsize; - *(uint32_t*)(buf+80) = 128; - _Float16 *fp16 = (_Float16*)(buf + 128); - for (int i = 0; i < rows * cols; i++) fp16[i] = (_Float16)w[i]; - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows] -static NSData *build_blob_transposed(const float *w, int rows, int cols) { - int wsize = cols * rows * 2; - int total = 128 + wsize; - uint8_t *buf = (uint8_t*)calloc(total, 1); - buf[0] = 0x01; buf[4] = 0x02; - buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; - buf[68] = 0x01; - *(uint32_t*)(buf+72) = wsize; - *(uint32_t*)(buf+80) = 128; - _Float16 *fp16 = (_Float16*)(buf + 128); - for (int i = 0; i < rows; i++) - for (int j = 0; j < cols; j++) - fp16[j * rows + i] = (_Float16)w[i * cols + j]; // transpose - return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; -} - -static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { - return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = 
tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" - " } -> (y);\n}\n", - in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; -} - -typedef struct { - id model; - IOSurfaceRef ioIn, ioOut; - id request; - NSString *tmpDir; -} Kern; - -static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { - NSString *mil = gen_conv_mil(in_ch, out_ch, sp); - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; - NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}}; - id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil); - if (!desc) return NULL; - id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); - id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); - NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; - [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; - [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; - [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 
4; - IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); - id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); - id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); - id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, - @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), - @[wI], @[@0], @[wO], @[@0], nil, nil, @0); - Kern *k = calloc(1, sizeof(Kern)); - k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td; - return k; -} - -static void free_kern(Kern *k) { - if (!k) return; - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); - CFRelease(k->ioIn); CFRelease(k->ioOut); - [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; - free(k); -} - -// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first -static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; - IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); - IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); - NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); - IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); - IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); -} - -int main(int 
argc, char *argv[]) { - @autoreleasepool { - ane_init(); - mach_timebase_info_data_t tb; - mach_timebase_info(&tb); - - int D = 64, H = 128, S = 16; - int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit - float lr = 0.5f; - int recompile_every = 1; // recompile every step for correct gradients - - float *W1 = (float*)malloc(H * D * sizeof(float)); - float *W2 = (float*)malloc(D * H * sizeof(float)); - for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); - for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); - - float *x = (float*)calloc(S * D, sizeof(float)); - float *y_target = (float*)calloc(S * D, sizeof(float)); - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) { - float v = sinf((t * D + i) * 0.1f); - x[t*D + i] = v; - y_target[t*D + i] = v; - } - - printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n"); - printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D); - printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n"); - printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every); - - float *h = (float*)malloc(S * H * sizeof(float)); - float *h_relu = (float*)malloc(S * H * sizeof(float)); - float *y = (float*)malloc(S * D * sizeof(float)); - float *dy = (float*)malloc(S * D * sizeof(float)); - float *dh_relu = (float*)malloc(S * H * sizeof(float)); - float *dh = (float*)malloc(S * H * sizeof(float)); - float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works - float *dW1 = (float*)calloc(H * D, sizeof(float)); - float *dW2 = (float*)calloc(D * H, sizeof(float)); - - // 4 ANE kernels: 2 forward + 2 backward (transposed weights) - Kern *k1_fwd = NULL, *k2_fwd = NULL; // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D) - Kern *k1_bwd = NULL, *k2_bwd = NULL; // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H) - bool on_ane = true; - - printf("%-6s %-12s %-10s %-6s\n", 
"Step", "MSE Loss", "ms/step", "Backend"); - printf("--------------------------------------\n"); - - for (int step = 0; step < steps; step++) { - uint64_t t0 = mach_absolute_time(); - - if (on_ane && step % recompile_every == 0) { - free_kern(k1_fwd); free_kern(k2_fwd); - free_kern(k1_bwd); free_kern(k2_bwd); - k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; - @autoreleasepool { - k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); - k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); - // Backward: dx = W^T @ dy → conv with transposed weight - // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S] - k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); - // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S] - k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); - } - if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { - printf("ANE limit at step %d, continuing on CPU\n", step); - free_kern(k1_fwd); free_kern(k2_fwd); - free_kern(k1_bwd); free_kern(k2_bwd); - k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; - on_ane = false; - } - } - - if (on_ane) { - // === Forward on ANE === - ane_eval(k1_fwd, x, h, D, H, S); - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; - ane_eval(k2_fwd, h_relu, y, H, D, S); - } else { - for (int t = 0; t < S; t++) - for (int i = 0; i < H; i++) { - float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j]; - h[t*H+i] = s; - } - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) { - float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j]; - y[t*D+i] = s; - } - } - - // MSE loss + dL/dy - float loss = 0; - for (int i = 0; i < S*D; i++) { - float diff = y[i] - y_target[i]; - loss += diff * diff; - dy[i] = 2.0f * diff / (S * D); - } - loss /= (S * D); - - if (on_ane) { - // === Backward dx on ANE === - // dh_relu = W2^T @ dy (ANE conv with transposed W2) - ane_eval(k2_bwd, dy, dh_relu, D, H, S); - // ReLU backward (CPU, element-wise) - for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; - // dx = W1^T @ dh (ANE conv with transposed W1) - ane_eval(k1_bwd, dh, dx_layer, H, D, S); - } else { - memset(dh_relu, 0, S * H * sizeof(float)); - for (int t = 0; t < S; t++) - for (int j = 0; j < H; j++) - for (int i = 0; i < D; i++) - dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i]; - for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; - } - - // dW on CPU (outer products — memory-bound, not worth ANE) - memset(dW2, 0, D * H * sizeof(float)); - for (int t = 0; t < S; t++) - for (int i = 0; i < D; i++) - for (int j = 0; j < H; j++) - dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; - memset(dW1, 0, H * D * sizeof(float)); - for (int t = 0; t < S; t++) - for (int i = 0; i < H; i++) - for (int j = 0; j < D; j++) - dW1[i*D + j] += dh[t*H + i] * x[t*D + j]; - - // SGD - for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i]; - for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i]; - - double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6; - - if (step % 1 == 0 || step == steps - 1) - printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU"); - - if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } - } - - printf("\nFinal output vs target (first 8):\n"); - if (on_ane && k1_fwd && k2_fwd) { - ane_eval(k1_fwd, x, h, D, H, S); - for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; - ane_eval(k2_fwd, h_relu, y, H, D, S); - } - printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); - printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); - - free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); - free(W1); free(W2); free(x); free(y_target); - free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2); - printf("\nDone.\n"); - } - return 0; -} +// tiny_train.m — Train a 2-layer linear model on ANE (forward AND backward) +// y = W2 @ relu(W1 @ x), MSE loss, SGD update +// Forward: ANE conv with baked weights +// Backward dx: ANE conv with transposed baked weights +// Backward dW: CPU (outer product, memory-bound) +#import +#import +#import +#import +#import +#import +#include +#include "ane_compat.h" + +static Class g_D, g_I, g_AR, g_AIO; + +static void ane_init(void) { + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_D = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + g_I = NSClassFromString(@"_ANEInMemoryModel"); + g_AR = NSClassFromString(@"_ANERequest"); + g_AIO= NSClassFromString(@"_ANEIOSurfaceObject"); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +static NSData *build_blob(const float *w, int rows, int cols) { + int wsize = rows * cols * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows * cols; i++) fp16[i] = 
(_Float16)w[i]; + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +// Build blob with TRANSPOSED weights: W[rows,cols] → W^T[cols,rows] +static NSData *build_blob_transposed(const float *w, int rows, int cols) { + int wsize = cols * rows * 2; + int total = 128 + wsize; + uint8_t *buf = (uint8_t*)calloc(total, 1); + buf[0] = 0x01; buf[4] = 0x02; + buf[64] = 0xEF; buf[65] = 0xBE; buf[66] = 0xAD; buf[67] = 0xDE; + buf[68] = 0x01; + *(uint32_t*)(buf+72) = wsize; + *(uint32_t*)(buf+80) = 128; + _Float16 *fp16 = (_Float16*)(buf + 128); + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + fp16[j * rows + i] = (_Float16)w[i * cols + j]; // transpose + return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; +} + +static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { + return [NSString stringWithFormat: + @"program(%s)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"\"}, " + "{\"coremlc-version\", \"\"}, {\"coremltools-component-milinternal\", \"\"}, " + "{\"coremltools-version\", \"\"}})]\n{\n" + " func main<%s>(tensor x) {\n" + " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" + " tensor W = const()[name = string(\"W\"), " + "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" + " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" + " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" + " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" + " tensor y = 
cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + " } -> (y);\n}\n", + g_ane_platform.mil_program, ane_mil_target(), + in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; +} + +typedef struct { + id model; + IOSurfaceRef ioIn, ioOut; + id request; + NSString *tmpDir; +} Kern; + +static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { + NSString *mil = gen_conv_mil(in_ch, out_ch, sp); + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + NSDictionary *wd = @{@"@model_path/weights/weight.bin":@{@"offset":@0,@"data":blob}}; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), milData, wd, nil); + if (!desc) return NULL; + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; + [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + NSError *e = nil; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; + size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); + id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); 
+ id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(g_AR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + Kern *k = calloc(1, sizeof(Kern)); + k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td; + return k; +} + +static void free_kern(Kern *k) { + if (!k) return; + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + CFRelease(k->ioIn); CFRelease(k->ioOut); + [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; + free(k); +} + +// ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first +static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { + float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + tmp[c*sp + t] = in[t*in_ch + c]; + IOSurfaceLock(k->ioIn, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioIn, 0, NULL); + free(tmp); + NSError *e = nil; + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e); + float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = tmp2[c*sp + t]; + free(tmp2); +} + +int main(int argc, char *argv[]) { + @autoreleasepool { + ane_init(); + ane_detect_platform(); + ane_print_platform(); + mach_timebase_info_data_t tb; + mach_timebase_info(&tb); + + int D = 64, H = 128, S = 16; + int steps = 25; // 4 kernels × 25 = 100 compiles, under 119 limit + float lr 
= 0.5f; + int recompile_every = 1; // recompile every step for correct gradients + + float *W1 = (float*)malloc(H * D * sizeof(float)); + float *W2 = (float*)malloc(D * H * sizeof(float)); + for (int i = 0; i < H*D; i++) W1[i] = 0.01f * sinf(i * 1.3f + 0.7f); + for (int i = 0; i < D*H; i++) W2[i] = 0.01f * cosf(i * 0.9f + 1.1f); + + float *x = (float*)calloc(S * D, sizeof(float)); + float *y_target = (float*)calloc(S * D, sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float v = sinf((t * D + i) * 0.1f); + x[t*D + i] = v; + y_target[t*D + i] = v; + } + + printf("=== Tiny 2-Layer ANE Training (Forward + Backward on ANE) ===\n"); + printf("x:[%d,%d] → W1:[%d,%d] → ReLU → W2:[%d,%d] → y:[%d,%d]\n", S,D, H,D, D,H, S,D); + printf("Forward: ANE conv | Backward dx: ANE conv(W^T) | Backward dW: CPU\n"); + printf("Steps: %d, LR: %.4f, Recompile every %d steps\n\n", steps, lr, recompile_every); + + float *h = (float*)malloc(S * H * sizeof(float)); + float *h_relu = (float*)malloc(S * H * sizeof(float)); + float *y = (float*)malloc(S * D * sizeof(float)); + float *dy = (float*)malloc(S * D * sizeof(float)); + float *dh_relu = (float*)malloc(S * H * sizeof(float)); + float *dh = (float*)malloc(S * H * sizeof(float)); + float *dx_layer = (float*)malloc(S * D * sizeof(float)); // not used for update but proves backward works + float *dW1 = (float*)calloc(H * D, sizeof(float)); + float *dW2 = (float*)calloc(D * H, sizeof(float)); + + // 4 ANE kernels: 2 forward + 2 backward (transposed weights) + Kern *k1_fwd = NULL, *k2_fwd = NULL; // W1: [H,D]→conv(D→H), W2: [D,H]→conv(H→D) + Kern *k1_bwd = NULL, *k2_bwd = NULL; // W1^T: [D,H]→conv(H→D), W2^T: [H,D]→conv(D→H) + bool on_ane = true; + + printf("%-6s %-12s %-10s %-6s\n", "Step", "MSE Loss", "ms/step", "Backend"); + printf("--------------------------------------\n"); + + for (int step = 0; step < steps; step++) { + uint64_t t0 = mach_absolute_time(); + + if (on_ane && step % recompile_every == 0) { + 
free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + @autoreleasepool { + k1_fwd = compile_kern_with_blob(build_blob(W1, H, D), D, H, S); + k2_fwd = compile_kern_with_blob(build_blob(W2, D, H), H, D, S); + // Backward: dx = W^T @ dy → conv with transposed weight + // W2^T: [H,D] as conv weight, input dy [1,D,1,S] → output dh [1,H,1,S] + k2_bwd = compile_kern_with_blob(build_blob_transposed(W2, D, H), D, H, S); + // W1^T: [D,H] as conv weight, input dh [1,H,1,S] → output dx [1,D,1,S] + k1_bwd = compile_kern_with_blob(build_blob_transposed(W1, H, D), H, D, S); + } + if (!k1_fwd || !k2_fwd || !k1_bwd || !k2_bwd) { + printf("ANE limit at step %d, continuing on CPU\n", step); + free_kern(k1_fwd); free_kern(k2_fwd); + free_kern(k1_bwd); free_kern(k2_bwd); + k1_fwd = k2_fwd = k1_bwd = k2_bwd = NULL; + on_ane = false; + } + } + + if (on_ane) { + // === Forward on ANE === + ane_eval(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + ane_eval(k2_fwd, h_relu, y, H, D, S); + } else { + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) { + float s = 0; for (int j = 0; j < D; j++) s += W1[i*D+j] * x[t*D+j]; + h[t*H+i] = s; + } + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? h[i] : 0; + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) { + float s = 0; for (int j = 0; j < H; j++) s += W2[i*H+j] * h_relu[t*H+j]; + y[t*D+i] = s; + } + } + + // MSE loss + dL/dy + float loss = 0; + for (int i = 0; i < S*D; i++) { + float diff = y[i] - y_target[i]; + loss += diff * diff; + dy[i] = 2.0f * diff / (S * D); + } + loss /= (S * D); + + if (on_ane) { + // === Backward dx on ANE === + // dh_relu = W2^T @ dy (ANE conv with transposed W2) + ane_eval(k2_bwd, dy, dh_relu, D, H, S); + // ReLU backward (CPU, element-wise) + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? 
dh_relu[i] : 0; + // dx = W1^T @ dh (ANE conv with transposed W1) + ane_eval(k1_bwd, dh, dx_layer, H, D, S); + } else { + memset(dh_relu, 0, S * H * sizeof(float)); + for (int t = 0; t < S; t++) + for (int j = 0; j < H; j++) + for (int i = 0; i < D; i++) + dh_relu[t*H + j] += W2[i*H + j] * dy[t*D + i]; + for (int i = 0; i < S*H; i++) dh[i] = h[i] > 0 ? dh_relu[i] : 0; + } + + // dW on CPU (outer products — memory-bound, not worth ANE) + memset(dW2, 0, D * H * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < D; i++) + for (int j = 0; j < H; j++) + dW2[i*H + j] += dy[t*D + i] * h_relu[t*H + j]; + memset(dW1, 0, H * D * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < H; i++) + for (int j = 0; j < D; j++) + dW1[i*D + j] += dh[t*H + i] * x[t*D + j]; + + // SGD + for (int i = 0; i < H*D; i++) W1[i] -= lr * dW1[i]; + for (int i = 0; i < D*H; i++) W2[i] -= lr * dW2[i]; + + double ms = (double)(mach_absolute_time() - t0) * tb.numer / tb.denom / 1e6; + + if (step % 1 == 0 || step == steps - 1) + printf("%-6d %-12.6f %-10.1f %-6s\n", step, loss, ms, on_ane ? "ANE" : "CPU"); + + if (loss < 1e-6f) { printf("\nConverged at step %d!\n", step); break; } + } + + printf("\nFinal output vs target (first 8):\n"); + if (on_ane && k1_fwd && k2_fwd) { + ane_eval(k1_fwd, x, h, D, H, S); + for (int i = 0; i < S*H; i++) h_relu[i] = h[i] > 0 ? 
h[i] : 0; + ane_eval(k2_fwd, h_relu, y, H, D, S); + } + printf(" y: "); for (int i = 0; i < 8; i++) printf("%.4f ", y[i]); printf("\n"); + printf(" target: "); for (int i = 0; i < 8; i++) printf("%.4f ", y_target[i]); printf("\n"); + + free_kern(k1_fwd); free_kern(k2_fwd); free_kern(k1_bwd); free_kern(k2_bwd); + free(W1); free(W2); free(x); free(y_target); + free(h); free(h_relu); free(y); free(dy); free(dh_relu); free(dh); free(dx_layer); free(dW1); free(dW2); + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/train_large.m b/training/train_large.m index e58ce08..c807352 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -1,687 +1,690 @@ -// train_large.m — Train stories110M (12 layers, 768dim, 3072hidden) on ANE -// Uses pretokenized TinyStories data with cross-entropy loss -// 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch -#include "stories_io.h" -#include "stories_mil.h" -#include "stories_cpu_ops.h" - -#define CKPT_PATH "ane_stories110M_ckpt.bin" -#define MODEL_PATH "../../assets/models/stories110M.bin" -#define DATA_PATH "tinystories_data00.bin" - -// ===== Weight loading from llama2.c format ===== -static bool load_pretrained(LayerWeights *lw, float *rms_final, float *embed, const char *path) { - FILE *f = fopen(path, "rb"); - if (!f) { printf("Cannot open %s\n", path); return false; } - Llama2Config cfg; - fread(&cfg, sizeof(cfg), 1, f); - printf(" Model config: dim=%d hidden=%d layers=%d heads=%d vocab=%d seq=%d\n", - cfg.dim, cfg.hidden_dim, cfg.n_layers, cfg.n_heads, abs(cfg.vocab_size), cfg.seq_len); - if (cfg.dim != DIM || cfg.hidden_dim != HIDDEN || cfg.n_layers != NLAYERS) { - printf(" ERROR: Config mismatch! 
Expected dim=%d hidden=%d layers=%d\n", DIM, HIDDEN, NLAYERS); - fclose(f); return false; - } - int V = abs(cfg.vocab_size); - bool shared = cfg.vocab_size > 0; - - // Read in llama2.c order: embed, rms_att[all], wq[all], wk[all], wv[all], wo[all], - // rms_ffn[all], w1[all], w2[all], w3[all], rms_final, [wcls] - fread(embed, 4, V * DIM, f); - - // rms_att weights for all layers (contiguous) - for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_att, 4, DIM, f); - // wq for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wq, 4, WQ_SZ, f); - // wk for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wk, 4, WQ_SZ, f); - // wv for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wv, 4, WQ_SZ, f); - // wo for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].Wo, 4, WO_SZ, f); - // rms_ffn weights for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].rms_ffn, 4, DIM, f); - // w1 for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].W1, 4, W1_SZ, f); - // w2 for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].W2, 4, W2_SZ, f); - // w3 for all layers - for (int L = 0; L < NLAYERS; L++) fread(lw[L].W3, 4, W3_SZ, f); - // rms_final - fread(rms_final, 4, DIM, f); - // wcls = embed if shared (we just use embed pointer) - - fclose(f); - printf(" Loaded pretrained weights (%s)\n", shared ? 
"shared embed/cls" : "separate cls"); - return true; -} - -// ===== Compile one layer's kernels ===== -static bool compile_layer_kernels(LayerKernels *lk, LayerWeights *w) { - lk->fwdAttn = compile_kern_mil_w(gen_sdpa_fwd_taps(), (@{ - @"@model_path/weights/rms1.bin": @{@"offset":@0, @"data":build_blob(w->rms_att,1,DIM)}, - @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(w->Wq,DIM,DIM)}, - @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(w->Wk,DIM,DIM)}, - @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(w->Wv,DIM,DIM)}, - @"@model_path/weights/wo.bin": @{@"offset":@0, @"data":build_blob(w->Wo,DIM,DIM)}, - @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, - }), DIM*SEQ*2, 6*DIM*SEQ*2); - - lk->fwdFFN = compile_kern_mil_w(gen_ffn_fwd_taps(), (@{ - @"@model_path/weights/rms2.bin": @{@"offset":@0, @"data":build_blob(w->rms_ffn,1,DIM)}, - @"@model_path/weights/w1.bin": @{@"offset":@0, @"data":build_blob(w->W1,HIDDEN,DIM)}, - @"@model_path/weights/w3.bin": @{@"offset":@0, @"data":build_blob(w->W3,HIDDEN,DIM)}, - @"@model_path/weights/w2.bin": @{@"offset":@0, @"data":build_blob(w->W2,DIM,HIDDEN)}, - }), DIM*SEQ*2, (2*DIM+3*HIDDEN)*SEQ*2); - - lk->ffnBwd = compile_kern_mil_w(gen_ffn_bwd(), (@{ - @"@model_path/weights/w2t.bin": @{@"offset":@0, @"data":build_blob_t(w->W2,DIM,HIDDEN)}, - @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(w->W1,HIDDEN,DIM)}, - @"@model_path/weights/w3t.bin": @{@"offset":@0, @"data":build_blob_t(w->W3,HIDDEN,DIM)}, - }), (DIM+2*HIDDEN)*SEQ*2, (DIM+2*HIDDEN)*SEQ*2); - - lk->sdpaBwd1 = compile_kern_mil_w(gen_sdpa_bwd1(), (@{ - @"@model_path/weights/mask.bin": @{@"offset":@0, @"data":get_mask_blob()}, - @"@model_path/weights/wot.bin": @{@"offset":@0, @"data":build_blob_t(w->Wo,DIM,DIM)}, - }), 4*DIM*SEQ*2, (DIM+2*SCORE_CH)*SEQ*2); - - lk->qkvBwd = compile_kern_mil_w(gen_qkvb(), (@{ - @"@model_path/weights/wqt.bin": @{@"offset":@0, 
@"data":build_blob_t(w->Wq,DIM,DIM)}, - @"@model_path/weights/wkt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wk,DIM,DIM)}, - @"@model_path/weights/wvt.bin": @{@"offset":@0, @"data":build_blob_t(w->Wv,DIM,DIM)}, - }), 3*DIM*SEQ*2, DIM*SEQ*2); - - return lk->fwdAttn && lk->fwdFFN && lk->ffnBwd && lk->sdpaBwd1 && lk->qkvBwd; -} - -// Compile weight-free sdpaBwd2 (only needs once, no weights) -static Kern *compile_sdpa_bwd2(void) { - return compile_kern_mil_w(gen_sdpa_bwd2(), @{}, - (2*SCORE_CH+2*DIM)*SEQ*2, 2*DIM*SEQ*2); -} - -static void free_layer_kernels(LayerKernels *lk) { - free_kern(lk->fwdAttn); free_kern(lk->fwdFFN); free_kern(lk->ffnBwd); - free_kern(lk->sdpaBwd1); free_kern(lk->qkvBwd); - // sdpaBwd2 is shared, freed separately - lk->fwdAttn = lk->fwdFFN = lk->ffnBwd = lk->sdpaBwd1 = lk->qkvBwd = NULL; -} - -// ===== Checkpoint save/load ===== -static void save_checkpoint(const char *path, int step, int total_steps, float lr, float loss, - double cc, double ct, double cw, int cs, int cb, int adam_t, - LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, - float *embed, AdamState *aembed) { - FILE *f = fopen(path, "wb"); - CkptHdr h = {0}; - h.magic = 0x424C5A54; h.version = 2; - h.step = step; h.total_steps = total_steps; - h.n_layers = NLAYERS; h.vocab_size = VOCAB; h.dim = DIM; - h.hidden_dim = HIDDEN; h.n_heads = HEADS; h.seq_len = SEQ; - h.lr = lr; h.loss = loss; - h.cum_compile = cc; h.cum_train = ct; h.cum_wall = cw; - h.cum_steps = cs; h.cum_batches = cb; h.adam_t = adam_t; - fwrite(&h, sizeof(h), 1, f); - // Per-layer weights + adam - for (int L = 0; L < NLAYERS; L++) { - fwrite(lw[L].Wq,4,WQ_SZ,f); fwrite(lw[L].Wk,4,WQ_SZ,f); - fwrite(lw[L].Wv,4,WQ_SZ,f); fwrite(lw[L].Wo,4,WO_SZ,f); - fwrite(lw[L].W1,4,W1_SZ,f); fwrite(lw[L].W2,4,W2_SZ,f); fwrite(lw[L].W3,4,W3_SZ,f); - fwrite(lw[L].rms_att,4,DIM,f); fwrite(lw[L].rms_ffn,4,DIM,f); - // Adam state - fwrite(la[L].Wq.m,4,WQ_SZ,f); fwrite(la[L].Wq.v,4,WQ_SZ,f); - 
fwrite(la[L].Wk.m,4,WQ_SZ,f); fwrite(la[L].Wk.v,4,WQ_SZ,f); - fwrite(la[L].Wv.m,4,WQ_SZ,f); fwrite(la[L].Wv.v,4,WQ_SZ,f); - fwrite(la[L].Wo.m,4,WO_SZ,f); fwrite(la[L].Wo.v,4,WO_SZ,f); - fwrite(la[L].W1.m,4,W1_SZ,f); fwrite(la[L].W1.v,4,W1_SZ,f); - fwrite(la[L].W2.m,4,W2_SZ,f); fwrite(la[L].W2.v,4,W2_SZ,f); - fwrite(la[L].W3.m,4,W3_SZ,f); fwrite(la[L].W3.v,4,W3_SZ,f); - fwrite(la[L].rms_att.m,4,DIM,f); fwrite(la[L].rms_att.v,4,DIM,f); - fwrite(la[L].rms_ffn.m,4,DIM,f); fwrite(la[L].rms_ffn.v,4,DIM,f); - } - fwrite(rms_final,4,DIM,f); - fwrite(arms_final->m,4,DIM,f); fwrite(arms_final->v,4,DIM,f); - fwrite(embed,4,VOCAB*DIM,f); - fwrite(aembed->m,4,VOCAB*DIM,f); fwrite(aembed->v,4,VOCAB*DIM,f); - fclose(f); -} - -static bool load_checkpoint(const char *path, int *step, int *total_steps, float *lr, float *loss, - double *cc, double *ct, double *cw, int *cs, int *cb, int *adam_t, - LayerWeights *lw, LayerAdam *la, float *rms_final, AdamState *arms_final, - float *embed, AdamState *aembed) { - FILE *f = fopen(path, "rb"); - if (!f) return false; - CkptHdr h; - fread(&h, sizeof(h), 1, f); - if (h.magic != 0x424C5A54 || h.version != 2) { fclose(f); return false; } - *step = h.step; *total_steps = h.total_steps; *lr = h.lr; *loss = h.loss; - *cc = h.cum_compile; *ct = h.cum_train; *cw = h.cum_wall; - *cs = h.cum_steps; *cb = h.cum_batches; *adam_t = h.adam_t; - for (int L = 0; L < NLAYERS; L++) { - fread(lw[L].Wq,4,WQ_SZ,f); fread(lw[L].Wk,4,WQ_SZ,f); - fread(lw[L].Wv,4,WQ_SZ,f); fread(lw[L].Wo,4,WO_SZ,f); - fread(lw[L].W1,4,W1_SZ,f); fread(lw[L].W2,4,W2_SZ,f); fread(lw[L].W3,4,W3_SZ,f); - fread(lw[L].rms_att,4,DIM,f); fread(lw[L].rms_ffn,4,DIM,f); - fread(la[L].Wq.m,4,WQ_SZ,f); fread(la[L].Wq.v,4,WQ_SZ,f); - fread(la[L].Wk.m,4,WQ_SZ,f); fread(la[L].Wk.v,4,WQ_SZ,f); - fread(la[L].Wv.m,4,WQ_SZ,f); fread(la[L].Wv.v,4,WQ_SZ,f); - fread(la[L].Wo.m,4,WO_SZ,f); fread(la[L].Wo.v,4,WO_SZ,f); - fread(la[L].W1.m,4,W1_SZ,f); fread(la[L].W1.v,4,W1_SZ,f); - fread(la[L].W2.m,4,W2_SZ,f); 
fread(la[L].W2.v,4,W2_SZ,f); - fread(la[L].W3.m,4,W3_SZ,f); fread(la[L].W3.v,4,W3_SZ,f); - fread(la[L].rms_att.m,4,DIM,f); fread(la[L].rms_att.v,4,DIM,f); - fread(la[L].rms_ffn.m,4,DIM,f); fread(la[L].rms_ffn.v,4,DIM,f); - } - fread(rms_final,4,DIM,f); - fread(arms_final->m,4,DIM,f); fread(arms_final->v,4,DIM,f); - fread(embed,4,VOCAB*DIM,f); - fread(aembed->m,4,VOCAB*DIM,f); fread(aembed->v,4,VOCAB*DIM,f); - fclose(f); - return true; -} - -// ===== Main ===== -int main(int argc, char *argv[]) { - @autoreleasepool { - setbuf(stdout, NULL); - ane_init(); - mach_timebase_info(&g_tb); - - int total_steps = 10000; - float lr = 3e-4f; - float adam_b1=0.9f, adam_b2=0.999f, adam_eps=1e-8f; - int adam_t = 0, start_step = 0; - - // Parse args - bool do_resume = false; - for (int i=1; i