-
Notifications
You must be signed in to change notification settings - Fork 0
fix: MIL syntax + M1/M2 backward compatibility #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| *.o | ||
| ane_probe | ||
| api_explore | ||
| inmem_basic | ||
| inmem_peak | ||
| tiny_train | ||
| tiny_train_m1 | ||
| train | ||
| train_large | ||
| test_weight_reload | ||
| test_perf_stats | ||
| test_qos_sweep | ||
| test_ane_advanced | ||
| test_ane_causal_attn | ||
| test_ane_sdpa5 | ||
| test_conv_attn3 | ||
| test_full_fused | ||
| test_fused_bwd | ||
| test_fused_qkv | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ | |
|
|
||
| static mach_timebase_info_data_t g_tb; | ||
| static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } | ||
| static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly | ||
|
|
||
| NSData *buildWeightBlob(int ch, int depth) { | ||
| NSUInteger wsize = ch * ch * 2; | ||
|
|
@@ -27,28 +28,45 @@ | |
|
|
||
| NSString *genMIL(int ch, int sp, int depth) { | ||
| NSMutableString *m = [NSMutableString string]; | ||
| [m appendString:@"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; | ||
| [m appendFormat:@" func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp]; | ||
| [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n" | ||
| @" tensor<int32, [2]> c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n" | ||
| @" tensor<int32, [4]> c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n" | ||
| @" tensor<int32, [2]> c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n" | ||
| @" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n" | ||
| @" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = string(\"fp16\")];\n"]; | ||
| [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp]; | ||
| [m appendString:@"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"]; | ||
| if (g_fp16_io) { | ||
| // fp16 I/O path — no cast ops (M1/M2 compatible) | ||
| [m appendFormat:@" func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", ch, sp]; | ||
| } else { | ||
| // fp32 I/O path — cast to/from fp16 internally (M4+ native) | ||
| [m appendFormat:@" func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n", ch, sp]; | ||
| } | ||
| [m appendString: | ||
| @" tensor<string, []> c_pad_type_0 = const()[name = tensor<string, []>(\"c_pad_type_0\"), val = tensor<string, []>(\"valid\")];\n" | ||
| @" tensor<int32, [2]> c_strides_0 = const()[name = tensor<string, []>(\"c_strides_0\"), val = tensor<int32, [2]>([1, 1])];\n" | ||
| @" tensor<int32, [4]> c_pad_0 = const()[name = tensor<string, []>(\"c_pad_0\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n" | ||
| @" tensor<int32, [2]> c_dilations_0 = const()[name = tensor<string, []>(\"c_dilations_0\"), val = tensor<int32, [2]>([1, 1])];\n" | ||
| @" tensor<int32, []> c_groups_0 = const()[name = tensor<string, []>(\"c_groups_0\"), val = tensor<int32, []>(1)];\n"]; | ||
| NSString *prev; | ||
| if (g_fp16_io) { | ||
| prev = @"x"; | ||
| } else { | ||
| [m appendString:@" tensor<string, []> x_to_fp16_dtype_0 = const()[name = tensor<string, []>(\"x_to_fp16_dtype_0\"), val = tensor<string, []>(\"fp16\")];\n"]; | ||
| [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor<string, []>(\"cast_in\")];\n", ch, sp]; | ||
| prev = @"x_to_fp16"; | ||
| } | ||
| NSUInteger cs = 64 + ch*ch*2; | ||
| NSString *prev = @"x_to_fp16"; | ||
| for (int i = 0; i < depth; i++) { | ||
| [m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = string(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n", | ||
| [m appendFormat:@" tensor<fp16, [%d, %d, 1, 1]> W%d = const()[name = tensor<string, []>(\"W%d\"), val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n", | ||
| ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)]; | ||
| NSString *out = [NSString stringWithFormat:@"c%d", i]; | ||
| [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n", | ||
| [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor<string, []>(\"%@\")];\n", | ||
| ch, sp, out, i, prev, out]; | ||
| prev = out; | ||
| } | ||
| [m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"]; | ||
| [m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev]; | ||
| [m appendString:@" } -> (c);\n}\n"]; | ||
| if (g_fp16_io) { | ||
| [m appendFormat:@" tensor<fp16, [1, %d, 1, %d]> c = identity(x = %@)[name = tensor<string, []>(\"out\")];\n", ch, sp, prev]; | ||
| [m appendString:@" } -> (c);\n}\n"]; | ||
| } else { | ||
| [m appendString:@" tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"]; | ||
| [m appendFormat:@" tensor<fp32, [1, %d, 1, %d]> c = cast(dtype = to_fp32, x = %@)[name = tensor<string, []>(\"cast_out\")];\n", ch, sp, prev]; | ||
| [m appendString:@" } -> (c);\n}\n"]; | ||
| } | ||
| return m; | ||
| } | ||
|
|
||
|
|
@@ -68,9 +86,18 @@ | |
| [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; | ||
| [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; | ||
| [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; | ||
| if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} | ||
| if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){ | ||
| [fm removeItemAtPath:td error:nil]; | ||
| if (!g_fp16_io) { | ||
| printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); | ||
| g_fp16_io = 1; | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The auto-retry with `g_fp16_io` — one edge case to consider: if [remainder of this comment was lost in extraction]
||
| return bench(ch, sp, depth); | ||
| } | ||
| return -3; | ||
| } | ||
| if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} | ||
| NSUInteger bytes=ch*sp*4; | ||
| size_t bpe = g_fp16_io ? 2 : 4; | ||
| NSUInteger bytes=ch*sp*bpe; | ||
| IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); | ||
| IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); | ||
| id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: the `static` keyword here gives `inmem_peak.m` its own private copy, which is fine since this file doesn't include `ane_mil_gen.h`. But it's worth noting the dual pattern: test files use `static int g_fp16_io` (file-local), while `ane_mil_gen.h` expects `extern int g_fp16_io` (shared). Both work in isolation — just something to be aware of for maintainability.

Would be cleaner long-term to pass `fp16_io` as a parameter to the MIL generation functions rather than relying on global mutable state.