From 709b60208fe69c753175e780dd6db978c8aa8b40 Mon Sep 17 00:00:00 2001
From: imperatormk
Date: Mon, 2 Mar 2026 22:00:45 +0100
Subject: [PATCH 1/4] Fix MIL syntax for cross-generation ANE compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MIL scalar types used shorthand syntax (string("x"), int32(1)) that only works on M4. Changed to the canonical verbose format that CoreML's own compiler emits (tensor<string, []>("x"), tensor<int32, []>(1)). Also targets program(1.0), with the matching older opset tag on `func main`, instead of program(1.3) and its newer tag, and simplifies buildInfo to just coremlc-version.

For conv-based kernels, adds a runtime fp16 I/O fallback — M1/M2 ANE doesn't support the cast op (fp32<->fp16), so on the first compile failure it retries with native fp16 inputs/outputs and does the conversion on the CPU side. The fallback is persisted across exec() restarts.

Note: matmul and scaled_dot_product_attention ops still fail on M1/M2 — these are M4+ ANE ops. The attention tests (test_ane_causal_attn, test_ane_sdpa5, and the attention part of test_full_fused) require M4 hardware. Conv-based kernels (training, QKV projections, FFN) work on all generations.

Tested on M1 Pro, macOS 26.3 (Tahoe).
--- .gitignore | 7 + training/ane_mil_gen.h | 238 ++++++++++++------ training/stories_mil.h | 412 ++++++++++++++++---------------- training/test_ane_advanced.m | 110 ++++++--- training/test_ane_causal_attn.m | 24 +- training/test_ane_sdpa5.m | 38 ++- training/test_conv_attn3.m | 22 +- training/test_full_fused.m | 124 +++++----- training/test_fused_bwd.m | 161 +++++++++---- training/test_fused_qkv.m | 203 +++++++++++----- training/test_perf_stats.m | 79 ++++-- training/test_qos_sweep.m | 81 ++++--- training/test_weight_reload.m | 135 +++++++---- training/tiny_train.m | 109 ++++++--- training/tiny_train_old.m | 118 ++++++--- 15 files changed, 1145 insertions(+), 716 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f4b86e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.o +ane_probe +api_explore +inmem_basic +tiny_train +tiny_train_m1 +train_large diff --git a/training/ane_mil_gen.h b/training/ane_mil_gen.h index 97fc451..5e205c3 100644 --- a/training/ane_mil_gen.h +++ b/training/ane_mil_gen.h @@ -5,6 +5,9 @@ #include #include +// Set by caller: 1 = fp16 I/O (M1/M2 fallback, no cast ops), 0 = fp32 I/O with cast (M4+) +extern int g_fp16_io; + // Build an FP16 weight blob with the required header structure. 
// weights_f32: source weights in row-major [out_ch, in_ch] // Returns NSData with header + FP16 weights @@ -30,21 +33,32 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i // Input W: [1, out_ch, in_ch] fp32 // Output: [1, out_ch, spatial] fp32 static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x, tensor W) {\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(false)];\n" + " tensor y = matmul(transpose_x = tx, transpose_y = ty, x = W, y = x)[name = tensor(\"mm\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, out_ch, in_ch, out_ch, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x, tensor W) {\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n" - " tensor W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n" - " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" - " bool ty = const()[name = string(\"ty\"), val = bool(false)];\n" - " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + " func main(tensor x, tensor W) {\n" + " tensor to_fp16 = const()[name = 
tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_x\")];\n" + " tensor W16 = cast(dtype = to_fp16, x = W)[name = tensor(\"cast_W\")];\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(false)];\n" + " tensor y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = tensor(\"mm\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = tensor(\"cast_out\")];\n" " } -> (y);\n" "}\n", in_ch, spatial, out_ch, in_ch, @@ -54,26 +68,45 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) { // Keep the baked-weight version for reference (used in inference-only scenarios) static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor y = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x)[name = tensor(\"conv\")];\n" + " } -> (y);\n" + "}\n", + in_ch, spatial, + out_ch, in_ch, out_ch, in_ch, + out_ch, spatial]; + } return [NSString stringWithFormat: - 
@"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" " tensor y16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, 
weight = W, x = x16)[name = string(\"conv\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = tensor(\"conv\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = to_fp32, x = y16)[name = tensor(\"cast_out\")];\n" " } -> (y);\n" "}\n", in_ch, spatial, in_ch, spatial, @@ -88,36 +121,65 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) { // where cs = 64 + dim*dim*2 static NSString *mil_gen_qkv(int dim, int spatial) { NSUInteger cs = 64 + (NSUInteger)dim * dim * 2; + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor q = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = 
tensor(\"conv_q\")];\n" + " tensor k = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor(\"conv_k\")];\n" + " tensor v = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor(\"conv_v\")];\n" + " } -> (q, k, v);\n" + "}\n", + dim, spatial, + dim, dim, dim, dim, + dim, dim, dim, dim, (unsigned long)(64 + cs), + dim, dim, dim, dim, (unsigned long)(64 + 2*cs), + dim, spatial, dim, spatial, dim, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor Wq = const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path = 
string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" " tensor q16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor(\"conv_q\")];\n" " tensor k16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor(\"conv_k\")];\n" " tensor v16 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = 
string(\"fp32\")];\n" - " tensor q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n" - " tensor k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n" - " tensor v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor(\"conv_v\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor q = cast(dtype = to_fp32, x = q16)[name = tensor(\"cast_q\")];\n" + " tensor k = cast(dtype = to_fp32, x = k16)[name = tensor(\"cast_k\")];\n" + " tensor v = cast(dtype = to_fp32, x = v16)[name = tensor(\"cast_v\")];\n" " } -> (q, k, v);\n" "}\n", dim, spatial, dim, spatial, @@ -173,31 +235,55 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in // Generate MIL for fused FFN up: w1 + w3 parallel convs static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) { NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2; + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" + "{\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" + " tensor out1 = conv(dilations = c_dilations, 
groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x)[name = tensor(\"conv_w1\")];\n" + " tensor out3 = conv(dilations = c_dilations, groups = c_groups, " + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x)[name = tensor(\"conv_w3\")];\n" + " } -> (out1, out3);\n" + "}\n", + dim, spatial, + hidden_dim, dim, hidden_dim, dim, + hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs), + hidden_dim, spatial, hidden_dim, spatial]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n" "{\n" - " func main(tensor x) {\n" - " string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n" - " tensor c_strides = const()[name = string(\"c_strides\"), val = tensor([1, 1])];\n" - " tensor c_pad = const()[name = string(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" - " tensor c_dilations = const()[name = string(\"c_dilations\"), val = tensor([1, 1])];\n" - " int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n" - " string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n" - " tensor W1 = const()[name = string(\"W1\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " tensor W3 = const()[name = string(\"W3\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n" + " func main(tensor x) {\n" + " tensor c_pad_type = const()[name = tensor(\"c_pad_type\"), val = tensor(\"valid\")];\n" + " tensor c_strides = const()[name = tensor(\"c_strides\"), val = tensor([1, 
1])];\n" + " tensor c_pad = const()[name = tensor(\"c_pad\"), val = tensor([0, 0, 0, 0])];\n" + " tensor c_dilations = const()[name = tensor(\"c_dilations\"), val = tensor([1, 1])];\n" + " tensor c_groups = const()[name = tensor(\"c_groups\"), val = tensor(1)];\n" + " tensor to_fp16 = const()[name = tensor(\"to_fp16\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = to_fp16, x = x)[name = tensor(\"cast_in\")];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n" " tensor h1 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = tensor(\"conv_w1\")];\n" " tensor h3 = conv(dilations = c_dilations, groups = c_groups, " - "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n" - " string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n" - " tensor out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n" - " tensor out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n" + "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = tensor(\"conv_w3\")];\n" + " tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n" + " tensor out1 = cast(dtype = to_fp32, x = h1)[name = tensor(\"cast_h1\")];\n" + " tensor out3 = cast(dtype = to_fp32, x = h3)[name = tensor(\"cast_h3\")];\n" " } -> (out1, out3);\n" "}\n", dim, spatial, dim, spatial, diff --git a/training/stories_mil.h b/training/stories_mil.h index dccca44..23f222a 100644 --- a/training/stories_mil.h +++ 
b/training/stories_mil.h @@ -4,15 +4,13 @@ #include "stories_io.h" #define MIL_HDR \ - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \ - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \ - "{\"coremltools-version\", \"9.0\"}})]\n{\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" #define CONV_CONST \ - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \ - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" \ - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" \ - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" \ - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" \ + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" \ + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" \ + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" \ + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" // SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm) static NSString *gen_sdpa_fwd_taps(void) { @@ -20,53 +18,53 @@ static NSString *gen_sdpa_fwd_taps(void) { float invd = 1.0f/(float)DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = 
mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=tensor(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor(\"ss\")];\n", SEQ]; + [m appendFormat:@" tensor invd = const()[name=tensor(\"invd\"), val=tensor(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=tensor(\"ss2\")];\n", SEQ]; + [m appendFormat:@" tensor eps = const()[name=tensor(\"eps\"), val=tensor(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=tensor(\"ss3\")];\n", SEQ]; + [m appendFormat:@" tensor nhalf = const()[name=tensor(\"nhalf\"), val=tensor(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=tensor(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=tensor(\"xr\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rw = const()[name=tensor(\"rw\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/rms1.bin\"), offset=tensor(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = 
mul(x=xr,y=rw)[name=tensor(\"xn\")];\n", DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor Wq = const()[name=string(\"Wq\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wk = const()[name=string(\"Wk\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wv = const()[name=string(\"Wv\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wo = const()[name=string(\"Wo\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor qsh = const()[name=string(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m 
appendString:@" bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"]; - [m appendString:@" bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor os = const()[name=string(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ]; + [m appendFormat:@" tensor Wq = const()[name=tensor(\"Wq\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wq.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wk = const()[name=tensor(\"Wk\"), 
val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wk.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wv = const()[name=tensor(\"Wv\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wv.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wo = const()[name=tensor(\"Wo\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wo.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=tensor(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=tensor(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=tensor(\"cv\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor qsh = const()[name=tensor(\"qsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor q4 = reshape(shape=qsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=q4)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor k4 = reshape(shape=qsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=k4)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor v4 = reshape(shape=qsh,x=vf)[name=tensor(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=v4)[name=tensor(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendString:@" tensor tx = const()[name=tensor(\"tx\"), val=tensor(false)];\n"]; + [m appendString:@" tensor ty = const()[name=tensor(\"ty\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=tensor(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = 
const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=tensor(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=tensor(\"cm\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/mask.bin\"), offset=tensor(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=tensor(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor sax = const()[name=tensor(\"sax\"), val=tensor(-1)];\n"]; + [m appendFormat:@" tensor aw = softmax(axis=sax,x=ms)[name=tensor(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=tensor(\"mm2\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor at = transpose(perm=pm,x=a4)[name=tensor(\"ta\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor os = const()[name=tensor(\"os\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor af = reshape(shape=os,x=at)[name=tensor(\"ra\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=tensor(\"co\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=tensor(\"cat\")];\n", 6*DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -76,33 +74,33 @@ static NSString *gen_ffn_fwd_taps(void) { float invd = 1.0f/(float)DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; - [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rax = const()[name=string(\"rax\"), val=tensor([1])];\n"]; - [m appendFormat:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m 
appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ]; - [m appendFormat:@" fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd]; - [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ]; - [m appendFormat:@" fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"]; - [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ]; - [m appendFormat:@" fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"]; - [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ]; - [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor rw = const()[name=string(\"rw\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM]; - [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM, SEQ]; + [m appendFormat:@" tensor sq = mul(x=x,y=x)[name=tensor(\"sq\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([1])];\n"]; + [m appendFormat:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor(\"ss\")];\n", SEQ]; + [m appendFormat:@" tensor invd = const()[name=tensor(\"invd\"), val=tensor(%f)];\n", invd]; + [m appendFormat:@" tensor ss2 = mul(x=ss,y=invd)[name=tensor(\"ss2\")];\n", SEQ]; + [m appendFormat:@" tensor eps = const()[name=tensor(\"eps\"), val=tensor(0.00001)];\n"]; + [m appendFormat:@" tensor ss3 = add(x=ss2,y=eps)[name=tensor(\"ss3\")];\n", SEQ]; + [m appendFormat:@" tensor nhalf = const()[name=tensor(\"nhalf\"), val=tensor(-0.5)];\n"]; + [m appendFormat:@" tensor rrms = pow(x=ss3,y=nhalf)[name=tensor(\"rrms\")];\n", SEQ]; + [m appendFormat:@" tensor xr = mul(x=x,y=rrms)[name=tensor(\"xr\")];\n", DIM, SEQ]; + [m 
appendFormat:@" tensor rw = const()[name=tensor(\"rw\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/rms2.bin\"), offset=tensor(64)))];\n", DIM, DIM]; + [m appendFormat:@" tensor xn = mul(x=xr,y=rw)[name=tensor(\"xn\")];\n", DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor W1 = const()[name=string(\"W1\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" tensor W3 = const()[name=string(\"W3\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; - [m appendFormat:@" tensor W2 = const()[name=string(\"W2\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; - [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ]; - [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; + [m appendFormat:@" tensor W1 = const()[name=tensor(\"W1\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w1.bin\"), offset=tensor(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m 
appendFormat:@" tensor W3 = const()[name=tensor(\"W3\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w3.bin\"), offset=tensor(64)))];\n", HIDDEN,DIM,HIDDEN,DIM]; + [m appendFormat:@" tensor W2 = const()[name=tensor(\"W2\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w2.bin\"), offset=tensor(64)))];\n", DIM,HIDDEN,DIM,HIDDEN]; + [m appendFormat:@" tensor h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=tensor(\"c1\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=tensor(\"c3\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=tensor(\"sg\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor silu = mul(x=h1,y=sig)[name=tensor(\"si\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor gate = mul(x=silu,y=h3)[name=tensor(\"gt\")];\n", HIDDEN,SEQ]; + [m appendFormat:@" tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=tensor(\"c2\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=tensor(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -111,36 +109,36 @@ static NSString *gen_ffn_fwd_taps(void) { static NSString *gen_ffn_bwd(void) { NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", DIM+2*HIDDEN, SEQ]; [m appendString:@CONV_CONST]; - [m appendString:@" tensor bd = const()[name=string(\"bd\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor sd = const()[name=string(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor dffn = 
slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor s1 = const()[name=string(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; - [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W2t = const()[name=string(\"W2t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; - [m appendFormat:@" tensor dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ]; - [m appendString:@" fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"]; - [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ]; - [m appendFormat:@" tensor W1t = const()[name=string(\"W1t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), 
offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor W3t = const()[name=string(\"W3t\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; - [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ]; - [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; + [m appendString:@" tensor bd = const()[name=tensor(\"bd\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor sd = const()[name=tensor(\"sd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor dffn = slice_by_size(x=x,begin=bd,size=sd)[name=tensor(\"s0\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor s1 = const()[name=tensor(\"s1\"), val=tensor([1,%d,1,%d])];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor h1 = slice_by_size(x=x,begin=b1,size=s1)[name=tensor(\"s1x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", DIM+HIDDEN]; + [m appendFormat:@" tensor h3 = slice_by_size(x=x,begin=b3,size=s1)[name=tensor(\"s3x\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W2t = const()[name=tensor(\"W2t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w2t.bin\"), offset=tensor(64)))];\n", HIDDEN, DIM, HIDDEN, DIM]; + [m appendFormat:@" tensor dsilu = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=tensor(\"cw2\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor sig = sigmoid(x=h1)[name=tensor(\"sg\")];\n", HIDDEN, SEQ]; + [m appendString:@" tensor one = const()[name=tensor(\"one\"), val=tensor(1.0)];\n"]; + [m appendFormat:@" tensor oms = sub(x=one,y=sig)[name=tensor(\"oms\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor homs = mul(x=h1,y=oms)[name=tensor(\"homs\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor brk = add(x=one,y=homs)[name=tensor(\"brk\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dsd = mul(x=sig,y=brk)[name=tensor(\"dsd\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor t1 = mul(x=dsilu,y=h3)[name=tensor(\"t1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh1 = mul(x=t1,y=dsd)[name=tensor(\"dh1\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor slh = mul(x=h1,y=sig)[name=tensor(\"slh\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor dh3 = mul(x=dsilu,y=slh)[name=tensor(\"dh3\")];\n", HIDDEN, SEQ]; + [m appendFormat:@" tensor W1t = const()[name=tensor(\"W1t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w1t.bin\"), offset=tensor(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor W3t = const()[name=tensor(\"W3t\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/w3t.bin\"), offset=tensor(64)))];\n", DIM, HIDDEN, DIM, HIDDEN]; + [m appendFormat:@" tensor dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=tensor(\"cw1\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=tensor(\"cw3\")];\n", DIM, SEQ]; + [m appendFormat:@" tensor dx = add(x=dx1,y=dx3)[name=tensor(\"adx\")];\n", DIM, SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = 
concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=tensor(\"cat\")];\n", DIM+2*HIDDEN, SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -149,23 +147,23 @@ static NSString *gen_ffn_bwd(void) { static NSString *gen_qkvb(void) { NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 3*DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", 3*DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wqt = const()[name=string(\"Wqt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wkt = const()[name=string(\"Wkt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor Wvt = const()[name=string(\"Wvt\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ]; - [m 
appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor sz = const()[name=tensor(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor dq = slice_by_size(x=x,begin=b0,size=sz)[name=tensor(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor dk = slice_by_size(x=x,begin=b1,size=sz)[name=tensor(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor dv = slice_by_size(x=x,begin=b2,size=sz)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wqt = const()[name=tensor(\"Wqt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wqt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wkt = const()[name=tensor(\"Wkt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wkt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor Wvt = const()[name=tensor(\"Wvt\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wvt.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=tensor(\"cq\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=tensor(\"ck\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=tensor(\"cv\")];\n", DIM,SEQ]; + [m 
appendFormat:@" tensor dxqk = add(x=dxq,y=dxk)[name=tensor(\"aqk\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor out = add(x=dxqk,y=dxv)[name=tensor(\"out\")];\n", DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -175,49 +173,49 @@ static NSString *gen_sdpa_bwd1(void) { float sc = 1.0f/sqrtf((float)HD); NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; + [m appendFormat:@" func main(tensor x) {\n", 4*DIM, SEQ]; [m appendString:@CONV_CONST]; - [m appendFormat:@" tensor sz = const()[name=string(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; - [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; - [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; - [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor Wot = const()[name=string(\"Wot\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM]; - [m appendFormat:@" tensor df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" 
tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor cm = const()[name=string(\"cm\"), val=tensor(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ]; - [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"]; - [m appendFormat:@" tensor probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dvt = 
transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dvs = const()[name=string(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor scs = const()[name=string(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; + [m appendFormat:@" tensor sz = const()[name=tensor(\"sz\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendString:@" tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b0,size=sz)[name=tensor(\"s0\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b1,size=sz)[name=tensor(\"s1\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*DIM]; + [m appendFormat:@" tensor vf = slice_by_size(x=x,begin=b2,size=sz)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", 3*DIM]; + [m appendFormat:@" tensor dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=tensor(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor Wot = const()[name=tensor(\"Wot\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/wot.bin\"), offset=tensor(64)))];\n", DIM,DIM,DIM,DIM]; + [m appendFormat:@" tensor df = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=tensor(\"cwo\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=tensor(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor vr = reshape(shape=rsh,x=vf)[name=tensor(\"rv\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor v = transpose(perm=pm,x=vr)[name=tensor(\"tv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dr = reshape(shape=rsh,x=df)[name=tensor(\"rd\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor da = transpose(perm=pm,x=dr)[name=tensor(\"td\")];\n", HEADS,SEQ,HD]; + [m appendString:@" tensor bF = const()[name=tensor(\"bF\"), val=tensor(false)];\n"]; + [m appendString:@" tensor bT = const()[name=tensor(\"bT\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=tensor(\"mm1\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor sc2 = mul(x=sc1,y=scv)[name=tensor(\"scl\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor cm = const()[name=tensor(\"cm\"), val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/mask.bin\"), offset=tensor(64)))];\n", SEQ,SEQ,SEQ,SEQ]; + [m appendFormat:@" tensor ms = add(x=sc2,y=cm)[name=tensor(\"msk\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor sax = const()[name=tensor(\"sax\"), val=tensor(-1)];\n"]; + [m appendFormat:@" tensor probs = 
softmax(axis=sax,x=ms)[name=tensor(\"sm\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=tensor(\"dv\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=tensor(\"dp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dvt = transpose(perm=pm,x=dv4)[name=tensor(\"dvt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dvs = const()[name=tensor(\"dvs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dvf = reshape(shape=dvs,x=dvt)[name=tensor(\"dvf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor scs = const()[name=tensor(\"scs\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor pf = reshape(shape=scs,x=probs)[name=tensor(\"pf\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor dpf = reshape(shape=scs,x=dp4)[name=tensor(\"dpf\")];\n", SCORE_CH,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=tensor(\"cat\")];\n", DIM+2*SCORE_CH,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } @@ -228,46 +226,46 @@ static NSString *gen_sdpa_bwd2(void) { int bwd2_in = 2*SCORE_CH + 2*DIM; NSMutableString *m = [NSMutableString string]; [m appendString:MIL_HDR]; - [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; - [m appendFormat:@" tensor sz_sc = const()[name=string(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; - [m appendString:@" tensor b0 = const()[name=string(\"b0\"), val=tensor([0,0,0,0])];\n"]; - [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor b1 = const()[name=string(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; - [m appendFormat:@" tensor dpf = 
slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ]; - [m appendFormat:@" tensor sz_d = const()[name=string(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; - [m appendFormat:@" tensor b2 = const()[name=string(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; - [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor b3 = const()[name=string(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; - [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor ssh = const()[name=string(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" tensor rsh = const()[name=string(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; - [m appendString:@" tensor pm = const()[name=string(\"pm\"), val=tensor([0,1,3,2])];\n"]; - [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" tensor rax = const()[name=string(\"rax\"), val=tensor([-1])];\n"]; - [m appendString:@" bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ]; - [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ]; - [m 
appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ]; - [m appendFormat:@" fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc]; - [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ]; - [m appendString:@" bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"]; - [m appendString:@" bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"]; - [m appendFormat:@" tensor dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD]; - [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ]; - [m appendFormat:@" tensor fs = const()[name=string(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; - [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ]; - [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ]; - [m appendString:@" int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"]; - [m appendString:@" bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"]; - [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ]; + [m appendFormat:@" func main(tensor x) {\n", bwd2_in, SEQ]; + [m appendFormat:@" tensor sz_sc = const()[name=tensor(\"szsc\"), val=tensor([1,%d,1,%d])];\n", SCORE_CH, SEQ]; + [m appendString:@" tensor b0 = const()[name=tensor(\"b0\"), val=tensor([0,0,0,0])];\n"]; + [m appendFormat:@" tensor pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=tensor(\"s0\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor b1 = const()[name=tensor(\"b1\"), val=tensor([0,%d,0,0])];\n", SCORE_CH]; + [m appendFormat:@" tensor dpf = 
slice_by_size(x=x,begin=b1,size=sz_sc)[name=tensor(\"s1\")];\n", SCORE_CH,SEQ]; + [m appendFormat:@" tensor sz_d = const()[name=tensor(\"szd\"), val=tensor([1,%d,1,%d])];\n", DIM, SEQ]; + [m appendFormat:@" tensor b2 = const()[name=tensor(\"b2\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH]; + [m appendFormat:@" tensor qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=tensor(\"s2\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor b3 = const()[name=tensor(\"b3\"), val=tensor([0,%d,0,0])];\n", 2*SCORE_CH+DIM]; + [m appendFormat:@" tensor kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=tensor(\"s3\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor ssh = const()[name=tensor(\"ssh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor probs = reshape(shape=ssh,x=pf)[name=tensor(\"rp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor dp = reshape(shape=ssh,x=dpf)[name=tensor(\"rdp\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor rsh = const()[name=tensor(\"rsh\"), val=tensor([1,%d,%d,%d])];\n", HEADS,HD,SEQ]; + [m appendString:@" tensor pm = const()[name=tensor(\"pm\"), val=tensor([0,1,3,2])];\n"]; + [m appendFormat:@" tensor qr = reshape(shape=rsh,x=qf)[name=tensor(\"rq\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor q = transpose(perm=pm,x=qr)[name=tensor(\"tq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor kr = reshape(shape=rsh,x=kf)[name=tensor(\"rk\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor k = transpose(perm=pm,x=kr)[name=tensor(\"tk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor pdp = mul(x=probs,y=dp)[name=tensor(\"pdp\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor rax = const()[name=tensor(\"rax\"), val=tensor([-1])];\n"]; + [m appendString:@" tensor kd = const()[name=tensor(\"kd\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=tensor(\"rs\")];\n", HEADS,SEQ]; + [m appendFormat:@" tensor dps = sub(x=dp,y=spdp)[name=tensor(\"dps\")];\n", HEADS,SEQ,SEQ]; + [m 
appendFormat:@" tensor ds0 = mul(x=probs,y=dps)[name=tensor(\"ds0\")];\n", HEADS,SEQ,SEQ]; + [m appendFormat:@" tensor scv = const()[name=tensor(\"scv\"), val=tensor(%f)];\n", sc]; + [m appendFormat:@" tensor ds = mul(x=ds0,y=scv)[name=tensor(\"ds\")];\n", HEADS,SEQ,SEQ]; + [m appendString:@" tensor bF = const()[name=tensor(\"bF\"), val=tensor(false)];\n"]; + [m appendString:@" tensor bT = const()[name=tensor(\"bT\"), val=tensor(true)];\n"]; + [m appendFormat:@" tensor dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=tensor(\"dq\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=tensor(\"dk\")];\n", HEADS,SEQ,HD]; + [m appendFormat:@" tensor dqt = transpose(perm=pm,x=dq4)[name=tensor(\"dqt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor dkt = transpose(perm=pm,x=dk4)[name=tensor(\"dkt\")];\n", HEADS,HD,SEQ]; + [m appendFormat:@" tensor fs = const()[name=tensor(\"fs\"), val=tensor([1,%d,1,%d])];\n", DIM,SEQ]; + [m appendFormat:@" tensor dqf = reshape(shape=fs,x=dqt)[name=tensor(\"dqf\")];\n", DIM,SEQ]; + [m appendFormat:@" tensor dkf = reshape(shape=fs,x=dkt)[name=tensor(\"dkf\")];\n", DIM,SEQ]; + [m appendString:@" tensor cax = const()[name=tensor(\"cax\"), val=tensor(1)];\n"]; + [m appendString:@" tensor cid = const()[name=tensor(\"cid\"), val=tensor(false)];\n"]; + [m appendFormat:@" tensor out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=tensor(\"cat\")];\n", 2*DIM,SEQ]; [m appendString:@" } -> (out);\n}\n"]; return m; } diff --git a/training/test_ane_advanced.m b/training/test_ane_advanced.m index 07e9038..06c18e3 100644 --- a/training/test_ane_advanced.m +++ b/training/test_ane_advanced.m @@ -50,6 +50,8 @@ static IOSurfaceRef make_surface(size_t bytes) { (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + int main() { @autoreleasepool { setbuf(stdout, NULL); @@ -106,28 +108,43 @@ 
int main() { memcpy(blob+128, w, ws); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + NSFileManager *fm = [NSFileManager defaultManager]; + + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " 
tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -135,23 +152,33 @@ int main() { id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; [fm createDirectoryAtPath:[td 
stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!compiled && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_compile; + } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - int ioBytes = CH * SP * 4; + int ioBytes = CH * SP * (g_fp16_io ? 2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)((float)(s+1) * 0.1f); + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Baseline eval @@ -165,9 +192,16 @@ int main() { printf(" Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? 
"OK" : "FAIL"); IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); - float baseline_0 = out0[0], baseline_1 = out0[1]; - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); + float baseline_0, baseline_1; + if (g_fp16_io) { + _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + baseline_0 = (float)out0[0]; baseline_1 = (float)out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]); + } else { + float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut); + baseline_0 = out0[0]; baseline_1 = out0[1]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]); + } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); // Test weightsBuffer: IOSurface with 3x identity weights @@ -194,10 +228,18 @@ int main() { printf(" Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL"); if (ok) { IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); - bool changed = fabsf(outW[0] - baseline_0) > 0.001f; - bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f; + float outW_0; + if (g_fp16_io) { + _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + outW_0 = (float)outW[0]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]); + } else { + float *outW = (float*)IOSurfaceGetBaseAddress(ioOut); + outW_0 = outW[0]; + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]); + } + bool changed = fabsf(outW_0 - baseline_0) > 0.001f; + bool is_3x = fabsf(outW_0 - baseline_0 * 3.0f) < 0.1f; printf(" weightsBuffer: output %s", changed ? "CHANGED" : "unchanged"); if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" 
: "but not 3x as expected"); printf("\n"); diff --git a/training/test_ane_causal_attn.m b/training/test_ane_causal_attn.m index cb9b761..d279f96 100644 --- a/training/test_ane_causal_attn.m +++ b/training/test_ane_causal_attn.m @@ -81,13 +81,11 @@ int main() { // === Approach 1: Non-causal SDPA (baseline) === printf("=== Non-causal SDPA (baseline) ===\n"); NSString *sdpa_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; Kern kSDPA = compile_mil(sdpa_mil); @@ -100,13 +98,11 @@ int main() { // scores = Q @ K^T → [1, HEADS, SEQ, SEQ] printf("\n=== Decomposed causal attention ===\n"); NSString *qkt_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k) {\n" " tensor scores = matmul(" - "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n" + "x = q, y = k, transpose_y = true)[name = tensor(\"qkt\")];\n" " } -> (scores);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ]; Kern kQKT = compile_mil(qkt_mil); @@ -114,13 +110,11 @@ int main() { // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, 
HD] NSString *sv_mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor s, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor s, " "tensor v) {\n" " tensor out = matmul(" - "x = s, y = v)[name = string(\"sv\")];\n" + "x = s, y = v)[name = tensor(\"sv\")];\n" " } -> (out);\n}\n", HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD]; Kern kSV = compile_mil(sv_mil); diff --git a/training/test_ane_sdpa5.m b/training/test_ane_sdpa5.m index 0ddce84..b348fa4 100644 --- a/training/test_ane_sdpa5.m +++ b/training/test_ane_sdpa5.m @@ -187,13 +187,11 @@ int main() { printf("Test 1: no mask\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD]; Model m = compile_model(mil, nil); @@ -209,14 +207,12 @@ int main() { { NSString *maskStr = build_inline_causal_mask(SEQ); NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo 
= dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" - " %@ mask = const()[name = string(\"mask\"), val = %@];\n" + " %@ mask = const()[name = tensor(\"mask\"), val = %@];\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, [NSString stringWithFormat:@"tensor", SEQ, SEQ], maskStr, @@ -233,15 +229,13 @@ int main() { printf("\nTest 3: BLOBFILE causal mask\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v) {\n" - " tensor mask = const()[name = string(\"mask\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" + " tensor mask = const()[name = tensor(\"mask\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/mask.bin\"), offset = tensor(64)))];\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD]; @@ -258,14 +252,12 @@ int main() { printf("\nTest 4: mask as runtime input\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, 
" - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor q, " + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor q, " "tensor k, tensor v, " "tensor mask) {\n" " tensor att = scaled_dot_product_attention(" - "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n" + "query = q, key = k, value = v, attn_mask = mask)[name = tensor(\"sdpa\")];\n" " } -> (att);\n}\n", HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, SEQ, SEQ, HEADS, SEQ, HD]; diff --git a/training/test_conv_attn3.m b/training/test_conv_attn3.m index a396b4d..301280a 100644 --- a/training/test_conv_attn3.m +++ b/training/test_conv_attn3.m @@ -82,19 +82,17 @@ static void cleanup_kern(Kern *k) { static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) { return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = 
const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(%d)];\n" " tensor y = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n" + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" " } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp]; } diff --git a/training/test_full_fused.m b/training/test_full_fused.m index 8449ddb..e112d48 100644 --- a/training/test_full_fused.m +++ b/training/test_full_fused.m @@ -130,64 +130,62 @@ int main() { float scale_val = 1.0f / sqrtf((float)HD); NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" // Conv boilerplate - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr1 = const()[name = tensor(\"g1\"), val = tensor(1)];\n" // QKV weights - " tensor Wq = 
const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" - " tensor Wout = const()[name = string(\"Wo\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = tensor(64)))];\n" + " tensor Wout = const()[name = tensor(\"Wo\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wo.bin\"), offset = tensor(64)))];\n" // QKV projections " tensor q_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n" + "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor(\"cq\")];\n" " tensor k_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n" + "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor(\"ck\")];\n" " tensor v_flat = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n" + "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor(\"cv\")];\n" // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD] - " tensor qsh = const()[name = string(\"qsh\"), val = 
tensor([1, %d, %d, %d])];\n" - " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n" - " tensor perm = const()[name = string(\"pm\"), val = tensor([0, 1, 3, 2])];\n" - " tensor q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n" - " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n" - " tensor k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n" - " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n" - " tensor v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n" + " tensor qsh = const()[name = tensor(\"qsh\"), val = tensor([1, %d, %d, %d])];\n" + " tensor q_4d = reshape(shape = qsh, x = q_flat)[name = tensor(\"rq\")];\n" + " tensor perm = const()[name = tensor(\"pm\"), val = tensor([0, 1, 3, 2])];\n" + " tensor q = transpose(perm = perm, x = q_4d)[name = tensor(\"tq\")];\n" + " tensor k_4d = reshape(shape = qsh, x = k_flat)[name = tensor(\"rk\")];\n" + " tensor k = transpose(perm = perm, x = k_4d)[name = tensor(\"tk\")];\n" + " tensor v_4d = reshape(shape = qsh, x = v_flat)[name = tensor(\"rv\")];\n" + " tensor v = transpose(perm = perm, x = v_4d)[name = tensor(\"tv\")];\n" // Q @ K^T - " bool ty = const()[name = string(\"ty\"), val = bool(true)];\n" - " bool tx = const()[name = string(\"tx\"), val = bool(false)];\n" - " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n" + " tensor ty = const()[name = tensor(\"ty\"), val = tensor(true)];\n" + " tensor tx = const()[name = tensor(\"tx\"), val = tensor(false)];\n" + " tensor scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = tensor(\"mm1\")];\n" // Scale - " fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n" - " tensor scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n" + " tensor sc = const()[name = tensor(\"sc\"), val = fp16(%f)];\n" + " tensor scaled = mul(x = scores, y = sc)[name = tensor(\"scl\")];\n" // Causal mask - " 
tensor cmask = const()[name = string(\"cm\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n" - " tensor masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n" + " tensor cmask = const()[name = tensor(\"cm\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/mask.bin\"), offset = tensor(64)))];\n" + " tensor masked = add(x = scaled, y = cmask)[name = tensor(\"msk\")];\n" // Softmax - " int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n" - " tensor attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n" + " tensor sax = const()[name = tensor(\"sax\"), val = tensor(-1)];\n" + " tensor attn_w = softmax(axis = sax, x = masked)[name = tensor(\"sm\")];\n" // scores @ V - " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n" + " tensor attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = tensor(\"mm2\")];\n" // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ] - " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n" - " tensor osh = const()[name = string(\"osh\"), val = tensor([1, %d, 1, %d])];\n" - " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n" + " tensor attn_t = transpose(perm = perm, x = attn_4d)[name = tensor(\"ta\")];\n" + " tensor osh = const()[name = tensor(\"osh\"), val = tensor([1, %d, 1, %d])];\n" + " tensor attn_flat = reshape(shape = osh, x = attn_t)[name = tensor(\"ra\")];\n" // Wo projection " tensor out = conv(dilations = dl, groups = gr1, pad = pd, " - "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = tensor(\"co\")];\n" " } -> (out);\n}\n", DIM, SEQ, // input DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM, // Wq, Wk @@ -317,30 +315,28 @@ int main() { printf("\n=== Test 2: Fused 
FFN benchmark ===\n"); { NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor W1 = const()[name = string(\"W1\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n" - " tensor W3 = const()[name = string(\"W3\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n" - " tensor W2 = const()[name = string(\"W2\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1 = const()[name = tensor(\"W1\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1.bin\"), offset = tensor(64)))];\n" + " tensor W3 = const()[name = tensor(\"W3\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3.bin\"), offset = tensor(64)))];\n" + " tensor W2 = const()[name = tensor(\"W2\"), " + "val = tensor(BLOBFILE(path = 
tensor(\"@model_path/weights/w2.bin\"), offset = tensor(64)))];\n" " tensor h1 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n" + "pad_type = pt, strides = st, weight = W1, x = x)[name = tensor(\"c1\")];\n" " tensor h3 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n" - " tensor sig = sigmoid(x = h1)[name = string(\"sg\")];\n" - " tensor silu = mul(x = h1, y = sig)[name = string(\"si\")];\n" - " tensor gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n" + "pad_type = pt, strides = st, weight = W3, x = x)[name = tensor(\"c3\")];\n" + " tensor sig = sigmoid(x = h1)[name = tensor(\"sg\")];\n" + " tensor silu = mul(x = h1, y = sig)[name = tensor(\"si\")];\n" + " tensor gate = mul(x = silu, y = h3)[name = tensor(\"gt\")];\n" " tensor out = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n" + "pad_type = pt, strides = st, weight = W2, x = gate)[name = tensor(\"c2\")];\n" " } -> (out);\n}\n", DIM, SEQ, HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN, diff --git a/training/test_fused_bwd.m b/training/test_fused_bwd.m index b91d7b6..831f784 100644 --- a/training/test_fused_bwd.m +++ b/training/test_fused_bwd.m @@ -15,6 +15,8 @@ #define HIDDEN 2048 #define SEQ 64 +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static Class g_D, g_I, g_AR, g_AIO; static void ane_init(void) { dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); @@ -58,47 +60,77 @@ int main() { // MIL: slice input → 2 convs → add printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n"); - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, 
{\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" // [1, HIDDEN*2, 1, SEQ] - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :] - " tensor b1 = const()[name = string(\"b1\"), val = tensor([0, 0, 0, 0])];\n" - " tensor s1 = const()[name = string(\"s1\"), val = tensor([1, %d, 1, %d])];\n" - " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n" - " tensor b3 = const()[name = string(\"b3\"), val = tensor([0, %d, 0, 0])];\n" - " tensor s3 = const()[name = string(\"s3\"), val = tensor([1, %d, 1, %d])];\n" - " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n" - // Conv: W1^T @ dh1, W3^T @ dh3 - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - // W1^T: [DIM, HIDDEN, 1, 1] (transposed from [HIDDEN, DIM]) - " tensor W1t = const()[name = string(\"W1t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n" - " tensor W3t = const()[name = string(\"W3t\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n" - " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n" - " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n" - // Add - " tensor sum = add(x = dx1, y = dx3)[name = 
string(\"ad\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n" - " } -> (y);\n}\n", - HIDDEN*2, SEQ, HIDDEN*2, SEQ, - HIDDEN, SEQ, HIDDEN, SEQ, // slice1 - HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, // slice3 - DIM, HIDDEN, DIM, HIDDEN, // W1t - DIM, HIDDEN, DIM, HIDDEN, // W3t - DIM, SEQ, DIM, SEQ, // dx1, dx3 - DIM, SEQ, DIM, SEQ]; // sum, y + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor b1 = const()[name = tensor(\"b1\"), val = tensor([0, 0, 0, 0])];\n" + " tensor s1 = const()[name = tensor(\"s1\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh1 = slice_by_size(x = x, begin = b1, size = s1)[name = tensor(\"sl1\")];\n" + " tensor b3 = const()[name = tensor(\"b3\"), val = tensor([0, %d, 0, 0])];\n" + " tensor s3 = const()[name = tensor(\"s3\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh3 = slice_by_size(x = x, begin = b3, size = s3)[name = tensor(\"sl3\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1t = const()[name = tensor(\"W1t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1t.bin\"), offset = tensor(64)))];\n" + " tensor W3t = const()[name = tensor(\"W3t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3t.bin\"), offset = tensor(64)))];\n" + " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor(\"cv1\")];\n" + " tensor dx3 = conv(dilations = 
dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor(\"cv3\")];\n" + " tensor y = add(x = dx1, y = dx3)[name = tensor(\"ad\")];\n" + " } -> (y);\n}\n", + HIDDEN*2, SEQ, + HIDDEN, SEQ, HIDDEN, SEQ, + HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, + DIM, HIDDEN, DIM, HIDDEN, + DIM, HIDDEN, DIM, HIDDEN, + DIM, SEQ, DIM, SEQ, + DIM, SEQ]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor b1 = const()[name = tensor(\"b1\"), val = tensor([0, 0, 0, 0])];\n" + " tensor s1 = const()[name = tensor(\"s1\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = tensor(\"sl1\")];\n" + " tensor b3 = const()[name = tensor(\"b3\"), val = tensor([0, %d, 0, 0])];\n" + " tensor s3 = const()[name = tensor(\"s3\"), val = tensor([1, %d, 1, %d])];\n" + " tensor dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = tensor(\"sl3\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor W1t = const()[name = tensor(\"W1t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w1t.bin\"), offset = tensor(64)))];\n" + " tensor W3t = const()[name = tensor(\"W3t\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w3t.bin\"), offset = tensor(64)))];\n" + " tensor dx1 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor(\"cv1\")];\n" 
+ " tensor dx3 = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor(\"cv3\")];\n" + " tensor sum = add(x = dx1, y = dx3)[name = tensor(\"ad\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = sum)[name = tensor(\"co\")];\n" + " } -> (y);\n}\n", + HIDDEN*2, SEQ, HIDDEN*2, SEQ, + HIDDEN, SEQ, HIDDEN, SEQ, + HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ, + DIM, HIDDEN, DIM, HIDDEN, + DIM, HIDDEN, DIM, HIDDEN, + DIM, SEQ, DIM, SEQ, + DIM, SEQ, DIM, SEQ]; + } NSDictionary *wd = @{ @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)}, @@ -119,6 +151,12 @@ int main() { NSError *e = nil; BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + goto retry_compile; + } printf("Compile: %s\n", ok?"OK":"FAIL"); if (!ok) { printf(" %s\n", e?[[e description] UTF8String]:""); return 1; } ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); @@ -130,13 +168,21 @@ int main() { float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float)); for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); } - IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4); + size_t bpe = g_fp16_io ? 
2 : 4; + IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*bpe), ioO = make_surface(DIM*SEQ*bpe); IOSurfaceLock(ioI, 0, NULL); - float *dst = (float*)IOSurfaceGetBaseAddress(ioI); - // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3 - for (int t = 0; t < SEQ; t++) { - for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; - for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; + if (g_fp16_io) { + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioI); + for (int t = 0; t < SEQ; t++) { + for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = (_Float16)dh1[t*HIDDEN+c]; + for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = (_Float16)dh3[t*HIDDEN+c]; + } + } else { + float *dst = (float*)IOSurfaceGetBaseAddress(ioI); + for (int t = 0; t < SEQ; t++) { + for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c]; + for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c]; + } } IOSurfaceUnlock(ioI, 0, NULL); @@ -164,13 +210,22 @@ int main() { } IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL); - float *src = (float*)IOSurfaceGetBaseAddress(ioO); float maxd = 0; - for (int t = 0; t < SEQ; t++) - for (int c = 0; c < DIM; c++) { - float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]); - if (d > maxd) maxd = d; - } + if (g_fp16_io) { + _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(ioO); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) { + float d = fabsf((float)src[c*SEQ+t] - ref[t*DIM+c]); + if (d > maxd) maxd = d; + } + } else { + float *src = (float*)IOSurfaceGetBaseAddress(ioO); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) { + float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]); + if (d > maxd) maxd = d; + } + } IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL); printf("dx max diff: %.6f\n", maxd); diff --git a/training/test_fused_qkv.m b/training/test_fused_qkv.m index 69f41d6..f5758c0 100644 --- a/training/test_fused_qkv.m +++ b/training/test_fused_qkv.m @@ -12,6 +12,8 @@ 
#define DIM 768 #define SEQ 64 +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static Class g_D, g_I, g_AR, g_AIO; static mach_timebase_info_data_t g_tb; static void ane_init(void) { @@ -56,7 +58,10 @@ static Kern compile_mil(NSString *mil, NSDictionary *wd) { } NSError *e = nil; if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { - printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k; + printf("compile %s: %s\n", g_fp16_io ? "FAIL" : "failed (will retry)", + e ? [[e localizedDescription] UTF8String] : ""); + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + return k; } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); k.model = mdl; k.td = td; @@ -85,67 +90,108 @@ static void cleanup_kern(Kern *k) { // Fused QKV: 3 convs + concat in one MIL static NSString *gen_fused_qkv_mil(void) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = 
tensor(64)))];\n" + " tensor q = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor(\"cq\")];\n" + " tensor k = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor(\"ck\")];\n" + " tensor v = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor(\"cv\")];\n" + " tensor ax = const()[name = tensor(\"ax\"), val = tensor(1)];\n" + " tensor inter = const()[name = tensor(\"il\"), val = tensor(false)];\n" + " tensor y = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor(\"cat\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, SEQ, DIM, SEQ, DIM, SEQ, + DIM*3, SEQ]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" - " tensor Wq = const()[name = string(\"Wq\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n" - " tensor Wk = const()[name = string(\"Wk\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n" - " tensor Wv = const()[name = string(\"Wv\"), " - "val = tensor(BLOBFILE(path 
= string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor Wq = const()[name = tensor(\"Wq\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wq.bin\"), offset = tensor(64)))];\n" + " tensor Wk = const()[name = tensor(\"Wk\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wk.bin\"), offset = tensor(64)))];\n" + " tensor Wv = const()[name = tensor(\"Wv\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/wv.bin\"), offset = tensor(64)))];\n" " tensor q = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n" + "pad_type = pt, strides = st, weight = Wq, x = x16)[name = tensor(\"cq\")];\n" " tensor k = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n" + "pad_type = pt, strides = st, weight = Wk, x = x16)[name = tensor(\"ck\")];\n" " tensor v = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n" - " int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n" - " bool inter = const()[name = string(\"il\"), val = bool(false)];\n" - " tensor qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n" - " string d2 = const()[name = 
string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = Wv, x = x16)[name = tensor(\"cv\")];\n" + " tensor ax = const()[name = tensor(\"ax\"), val = tensor(1)];\n" + " tensor inter = const()[name = tensor(\"il\"), val = tensor(false)];\n" + " tensor qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor(\"cat\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = qkv)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", DIM, SEQ, DIM, SEQ, - DIM, DIM, DIM, DIM, // Wq - DIM, DIM, DIM, DIM, // Wk - DIM, DIM, DIM, DIM, // Wv - DIM, SEQ, // q - DIM, SEQ, // k - DIM, SEQ, // v - DIM*3, SEQ, // concat - DIM*3, SEQ]; // output + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, DIM, DIM, DIM, + DIM, SEQ, DIM, SEQ, DIM, SEQ, + DIM*3, SEQ, DIM*3, SEQ]; } // Single conv MIL for comparison static NSString *gen_single_mil(void) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = 
dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/w.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = 
tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ]; } @@ -170,12 +216,18 @@ int main() { for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1); // === Compile fused QKV === + retry_compile:; NSDictionary *fused_wd = @{ @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)}, @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)}, @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)}, }; Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd); + if (!kFused.model && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + goto retry_compile; + } printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL"); // === Compile 3 separate === @@ -187,16 +239,24 @@ int main() { if (!kFused.model || !kQ.model) goto done; // IOSurfaces - size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4; + size_t bpe = g_fp16_io ? 
2 : 4; + size_t in_bytes = DIM*SEQ*bpe, out1_bytes = DIM*SEQ*bpe, out3_bytes = DIM*3*SEQ*bpe; IOSurfaceRef ioIn = make_surface(in_bytes); IOSurfaceRef ioFused = make_surface(out3_bytes); IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes); IOSurfaceLock(ioIn, 0, NULL); - float *dst = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int t = 0; t < SEQ; t++) - for (int c = 0; c < DIM; c++) - dst[c*SEQ+t] = x[t*DIM+c]; + if (g_fp16_io) { + _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + dst[c*SEQ+t] = (_Float16)x[t*DIM+c]; + } else { + float *dst = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int t = 0; t < SEQ; t++) + for (int c = 0; c < DIM; c++) + dst[c*SEQ+t] = x[t*DIM+c]; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval fused @@ -212,17 +272,30 @@ int main() { IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL); IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL); IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL); - float *fo = (float*)IOSurfaceGetBaseAddress(ioFused); - float *qo = (float*)IOSurfaceGetBaseAddress(ioQ); - float *ko = (float*)IOSurfaceGetBaseAddress(ioK); - float *vo = (float*)IOSurfaceGetBaseAddress(ioV); float dq=0, dk=0, dv=0; - for (int c = 0; c < DIM; c++) - for (int t = 0; t < SEQ; t++) { - float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1; - float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2; - float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3; - } + if (g_fp16_io) { + _Float16 *fo = (_Float16*)IOSurfaceGetBaseAddress(ioFused); + _Float16 *qo = (_Float16*)IOSurfaceGetBaseAddress(ioQ); + _Float16 *ko = (_Float16*)IOSurfaceGetBaseAddress(ioK); + _Float16 *vo = (_Float16*)IOSurfaceGetBaseAddress(ioV); + for (int c = 0; c < DIM; c++) + for (int t = 0; t < SEQ; t++) { + float d1 = fabsf((float)fo[c*SEQ+t] - (float)qo[c*SEQ+t]); if(d1>dq) dq=d1; + float d2 = 
fabsf((float)fo[(DIM+c)*SEQ+t] - (float)ko[c*SEQ+t]); if(d2>dk) dk=d2; + float d3 = fabsf((float)fo[(DIM*2+c)*SEQ+t] - (float)vo[c*SEQ+t]); if(d3>dv) dv=d3; + } + } else { + float *fo = (float*)IOSurfaceGetBaseAddress(ioFused); + float *qo = (float*)IOSurfaceGetBaseAddress(ioQ); + float *ko = (float*)IOSurfaceGetBaseAddress(ioK); + float *vo = (float*)IOSurfaceGetBaseAddress(ioV); + for (int c = 0; c < DIM; c++) + for (int t = 0; t < SEQ; t++) { + float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1; + float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2; + float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3; + } + } IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL); IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL); IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL); diff --git a/training/test_perf_stats.m b/training/test_perf_stats.m index cf7b073..b1f903a 100644 --- a/training/test_perf_stats.m +++ b/training/test_perf_stats.m @@ -10,6 +10,8 @@ static mach_timebase_info_data_t g_tb; static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static void dump_class(const char *name) { Class cls = NSClassFromString([NSString stringWithUTF8String:name]); if (!cls) { printf(" %s: NOT FOUND\n", name); return; } @@ -118,28 +120,43 @@ int main() { NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; free(w); - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), 
val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + retry_compile:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n" + "[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = 
const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:), @@ -153,10 +170,15 @@ int main() { [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!compiled && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + goto retry_compile; + } ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); - int ioBytes = CH * SP * 4; // fp32 + int ioBytes = CH * SP * (g_fp16_io ? 
2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -174,8 +196,13 @@ int main() { if (req) { IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } IOSurfaceUnlock(ioIn, 0, NULL); BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( diff --git a/training/test_qos_sweep.m b/training/test_qos_sweep.m index 2802c6b..9afe1c3 100644 --- a/training/test_qos_sweep.m +++ b/training/test_qos_sweep.m @@ -10,6 +10,8 @@ static mach_timebase_info_data_t g_tb; static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static IOSurfaceRef make_surface(size_t bytes) { return IOSurfaceCreate((__bridge CFDictionaryRef)@{ (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, @@ -38,37 +40,49 @@ int main() { for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50)); NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; - NSString *mil = [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = 
const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" - " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; - - NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}; - NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; NSFileManager *fm = [NSFileManager defaultManager]; printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n"); printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6); printf("%4s %10s %10s %10s %10s %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status"); + retry_mil:; + NSString *mil; + if (g_fp16_io) { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + 
"[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP]; + } else { + mil = [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP]; + } + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63}; int n_qos = sizeof(qos_values)/sizeof(qos_values[0]); @@ -98,6 +112,12 @@ int main() { double cms = tb_ms(mach_absolute_time() - t0); if (!cok) { + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_mil; + } printf("%4u %10s %10s %10s %10s COMPILE_FAIL\n", qos, "-", "-", "-", "-"); [fm removeItemAtPath:td error:nil]; continue; @@ -115,7 +135,7 @@ int main() { continue; } - int ioBytes = CH * SP * 4; + int ioBytes = CH * SP * (g_fp16_io ? 
2 : 4); IOSurfaceRef ioIn = make_surface(ioBytes); IOSurfaceRef ioOut = make_surface(ioBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -125,8 +145,13 @@ int main() { @[wI], @[@0], @[wO], @[@0], nil, nil, @0); IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)0.5f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f; + } IOSurfaceUnlock(ioIn, 0, NULL); t0 = mach_absolute_time(); diff --git a/training/test_weight_reload.m b/training/test_weight_reload.m index a248005..b3161bd 100644 --- a/training/test_weight_reload.m +++ b/training/test_weight_reload.m @@ -34,30 +34,42 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES]; } -// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + +// Generate MIL for a simple conv (fp16 I/O when g_fp16_io, else fp32 with casts) static NSString *gen_mil(int ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = 
conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; + } return [NSString stringWithFormat: - @"program(1.3)\n" - "[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n" - "{\n" - " func main(tensor x) {\n" - " string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" - " tensor st = const()[name=string(\"st\"), val=tensor([1,1])];\n" - " tensor pd = const()[name=string(\"pd\"), val=tensor([0,0,0,0])];\n" - " tensor dl = const()[name=string(\"dl\"), val=tensor([1,1])];\n" - " int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n" - " string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n" - " tensor x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n" - " tensor W = const()[name=string(\"W\"), " - "val=tensor(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" - "[name=string(\"conv\")];\n" - " string to32 = const()[name=string(\"to32\"), 
val=string(\"fp32\")];\n" - " tensor y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n" - " } -> (y);\n" - "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; } int main() { @@ -88,6 +100,9 @@ int main() { for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f; NSData *wdataA = build_weight_blob(weightsA, CH, CH); + NSFileManager *fm = [NSFileManager defaultManager]; + + retry_compile:; NSString *mil = gen_mil(CH, SP); NSDictionary *weights = @{ @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA} @@ -103,13 +118,18 @@ int main() { id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc); id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; - NSFileManager *fm = [NSFileManager defaultManager]; [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok && !g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + goto retry_compile; + } if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; } ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); if (!ok) { printf("FAIL: load: 
%s\n", [[e description] UTF8String]); return 1; } @@ -117,9 +137,10 @@ int main() { printf(" Compile+load: %.1fms\n", compile_ms); printf(" tmpDir: %s\n", [td UTF8String]); - // Build request and IOSurfaces (fp32 I/O) - int inBytes = CH * SP * 4; // fp32 - int outBytes = CH * SP * 4; + // Build request and IOSurfaces + size_t bpe = g_fp16_io ? 2 : 4; + int inBytes = CH * SP * bpe; + int outBytes = CH * SP * bpe; IOSurfaceRef ioIn = make_surface(inBytes); IOSurfaceRef ioOut = make_surface(outBytes); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn); @@ -130,10 +151,17 @@ int main() { // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01 IOSurfaceLock(ioIn, 0, NULL); - float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f); + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval with weights A @@ -142,13 +170,17 @@ int main() { if (!ok) { printf("FAIL: eval: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]); + float *outA_copy = (float*)malloc(CH * SP * sizeof(float)); + if (g_fp16_io) { + _Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + for (int i = 0; i < CH*SP; i++) outA_copy[i] = (float)outA[i]; + } else { + float *outA = (float*)IOSurfaceGetBaseAddress(ioOut); + memcpy(outA_copy, outA, CH * SP * sizeof(float)); + } + printf(" Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA_copy[0], outA_copy[1], outA_copy[2], outA_copy[3]); printf(" Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]); - // Save copy - float *outA_copy = (float*)malloc(outBytes); - memcpy(outA_copy, outA, outBytes); + outA_copy[CH*SP-4], outA_copy[CH*SP-3], outA_copy[CH*SP-2], outA_copy[CH*SP-1]); IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); // === Step 3: Overwrite weight file with B, unload+load === @@ -189,10 +221,17 @@ int main() { // Re-write same input IOSurfaceLock(ioIn, 0, NULL); - inp = (float*)IOSurfaceGetBaseAddress(ioIn); - for (int c = 0; c < CH; c++) - for (int s = 0; s < SP; s++) - inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + if (g_fp16_io) { + _Float16 *inp2 = (_Float16*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp2[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f); + } else { + float *inp2 = (float*)IOSurfaceGetBaseAddress(ioIn); + for (int c = 0; c < CH; c++) + for (int s = 0; s < SP; s++) + inp2[c*SP+s] = (float)(c*SP + s + 1) * 0.01f; + } IOSurfaceUnlock(ioIn, 0, NULL); // Eval with (possibly reloaded) weights B @@ -201,16 +240,23 @@ int main() { if (!ok) { printf("FAIL: eval after reload: %s\n", e ? 
[[e description] UTF8String] : "?"); return 1; } IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL); - float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); - printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]); + float *outB_f = (float*)malloc(CH * SP * sizeof(float)); + if (g_fp16_io) { + _Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut); + for (int i = 0; i < CH*SP; i++) outB_f[i] = (float)outB[i]; + } else { + float *outB = (float*)IOSurfaceGetBaseAddress(ioOut); + memcpy(outB_f, outB, CH * SP * sizeof(float)); + } + printf(" Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB_f[0], outB_f[1], outB_f[2], outB_f[3]); printf(" Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1, - outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]); + outB_f[CH*SP-4], outB_f[CH*SP-3], outB_f[CH*SP-2], outB_f[CH*SP-1]); // Check: did the output change? bool changed = false; float max_diff = 0; for (int i = 0; i < CH*SP; i++) { - float d = fabsf(outB[i] - outA_copy[i]); + float d = fabsf(outB_f[i] - outA_copy[i]); if (d > max_diff) max_diff = d; if (d > 0.001f) changed = true; } @@ -219,11 +265,12 @@ int main() { float max_3x_err = 0; for (int i = 0; i < CH*SP; i++) { float expected = outA_copy[i] * 3.0f; - float err = fabsf(outB[i] - expected); + float err = fabsf(outB_f[i] - expected); if (err > max_3x_err) max_3x_err = err; if (err > 0.1f) correct_3x = false; } IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL); + free(outB_f); printf("\n=== RESULT ===\n"); printf(" Max A-B diff: %.6f\n", max_diff); diff --git a/training/tiny_train.m b/training/tiny_train.m index e1e9d7d..7aab4cd 100644 --- a/training/tiny_train.m +++ b/training/tiny_train.m @@ -59,25 +59,43 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static NSString *gen_conv_mil(int in_ch, int out_ch, 
int sp) { + if (g_fp16_io) { + // fp16 I/O path — no cast ops (M1/M2 compatible) + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp]; + } + // fp32 I/O path — cast to/from fp16 internally (M4+ native) return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + 
@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; } @@ -106,10 +124,19 @@ static IOSurfaceRef make_surface(size_t bytes) { [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + if (!g_fp16_io) { + // M1/M2 ANE doesn't support cast op — retry with fp16 
I/O + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return compile_kern_with_blob(blob, in_ch, out_ch, sp); + } + return NULL; + } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; __sync_fetch_and_add(&g_compile_count, 1); - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + size_t bpe = g_fp16_io ? 2 : 4; + size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe; IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); @@ -140,27 +167,43 @@ static void free_kern(Kern *k) { } static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; + // Transpose [S,C] -> [C,S] and write to IOSurface IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + void *base_in = IOSurfaceGetBaseAddress(k->ioIn); + if (g_fp16_io) { + _Float16 *dst = (_Float16*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = (_Float16)in[t*in_ch + c]; + } else { + float *dst = (float*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = in[t*in_ch + c]; + } IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); + NSError *e = nil; id mdl = (__bridge id)k->model; id req = (__bridge id)k->request; ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + + // Read output, transpose [C,S] -> [S,C] 
IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + void *base_out = IOSurfaceGetBaseAddress(k->ioOut); + if (g_fp16_io) { + _Float16 *src = (_Float16*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = (float)src[c*sp + t]; + } else { + float *src = (float*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = src[c*sp + t]; + } IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); } // === Checkpoint: save/restore training state for exec() restart === @@ -173,6 +216,7 @@ static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ float lr; double cum_compile_ms, cum_train_ms, cum_wall_ms; int cum_steps, cum_batches; + int fp16_io; // persisted: 1 if ANE needs fp16 I/O (M1/M2) } CkptHeader; static void save_checkpoint(const char *path, int step, float loss, @@ -180,7 +224,7 @@ static void save_checkpoint(const char *path, int step, float loss, const float *W1, const float *W2, double cc, double ct, double cw, int cs, int cb) { FILE *f = fopen(path, "wb"); - CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb}; + CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb, g_fp16_io}; fwrite(&hdr, sizeof(hdr), 1, f); fwrite(W1, sizeof(float), H * D, f); fwrite(W2, sizeof(float), D * H, f); @@ -241,8 +285,9 @@ int main(int argc, char *argv[]) { start_step = hdr.step; total_steps = hdr.total_steps; lr = hdr.lr; + g_fp16_io = hdr.fp16_io; resuming = true; - printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss); + printf("[RESUMED at step %d, loss=%.6f, fp16_io=%d, compiles reset]\n", start_step, hdr.loss, g_fp16_io); } } diff --git a/training/tiny_train_old.m b/training/tiny_train_old.m 
index c22a90c..0eea1f4 100644 --- a/training/tiny_train_old.m +++ b/training/tiny_train_old.m @@ -59,34 +59,50 @@ static IOSurfaceRef make_surface(size_t bytes) { return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES]; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly + static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" + " tensor y = conv(dilations = dl, groups = gr, pad = pd, " + "pad_type = pt, strides = st, weight = W, x = x)[name = tensor(\"cv\")];\n" + " } -> (y);\n}\n", + in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp]; + } return [NSString stringWithFormat: - @"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, " - "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " - "{\"coremltools-version\", \"9.0\"}})]\n{\n" - " func main(tensor x) {\n" - " string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n" - " tensor x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n" - " tensor W = const()[name = string(\"W\"), " - "val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n" - " string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n" - " tensor st = const()[name = string(\"st\"), val = 
tensor([1, 1])];\n" - " tensor pd = const()[name = string(\"pd\"), val = tensor([0, 0, 0, 0])];\n" - " tensor dl = const()[name = string(\"dl\"), val = tensor([1, 1])];\n" - " int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n" + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor d1 = const()[name = tensor(\"d1\"), val = tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype = d1, x = x)[name = tensor(\"cx\")];\n" + " tensor W = const()[name = tensor(\"W\"), " + "val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(64)))];\n" + " tensor pt = const()[name = tensor(\"pt\"), val = tensor(\"valid\")];\n" + " tensor st = const()[name = tensor(\"st\"), val = tensor([1, 1])];\n" + " tensor pd = const()[name = tensor(\"pd\"), val = tensor([0, 0, 0, 0])];\n" + " tensor dl = const()[name = tensor(\"dl\"), val = tensor([1, 1])];\n" + " tensor gr = const()[name = tensor(\"gr\"), val = tensor(1)];\n" " tensor y16 = conv(dilations = dl, groups = gr, pad = pd, " - "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n" - " string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n" - " tensor y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n" + "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor(\"cv\")];\n" + " tensor d2 = const()[name = tensor(\"d2\"), val = tensor(\"fp32\")];\n" + " tensor y = cast(dtype = d2, x = y16)[name = tensor(\"co\")];\n" " } -> (y);\n}\n", in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp]; } typedef struct { - id model; + void *model; // CFBridgingRetain'd _ANEInMemoryModel IOSurfaceRef ioIn, ioOut; - id request; - NSString *tmpDir; + void *request; // CFBridgingRetain'd _ANERequest + void *tmpDir; // CFBridgingRetain'd NSString } Kern; static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) { @@ -103,9 +119,17 @@ static IOSurfaceRef 
make_surface(size_t bytes) { [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; NSError *e = nil; - if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL; + if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) { + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return compile_kern_with_blob(blob, in_ch, out_ch, sp); + } + return NULL; + } if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL; - size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4; + size_t bpe = g_fp16_io ? 2 : 4; + size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe; IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB); id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI); id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO); @@ -113,40 +137,60 @@ static IOSurfaceRef make_surface(size_t bytes) { @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), @[wI], @[@0], @[wO], @[@0], nil, nil, @0); Kern *k = calloc(1, sizeof(Kern)); - k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td; + k->model = (void*)CFBridgingRetain(mdl); + k->ioIn = ioI; k->ioOut = ioO; + k->request = (void*)CFBridgingRetain(req); + k->tmpDir = (void*)CFBridgingRetain(td); return k; } static void free_kern(Kern *k) { if (!k) return; + id mdl = (__bridge id)k->model; NSError *e = nil; - ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e); + ((BOOL(*)(id,SEL,unsigned 
int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e); CFRelease(k->ioIn); CFRelease(k->ioOut); - [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil]; + NSString *td = (__bridge id)k->tmpDir; + [[NSFileManager defaultManager] removeItemAtPath:td error:nil]; + CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir); free(k); } // ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) { - float *tmp = (float*)malloc(in_ch * sp * sizeof(float)); - for (int t = 0; t < sp; t++) - for (int c = 0; c < in_ch; c++) - tmp[c*sp + t] = in[t*in_ch + c]; IOSurfaceLock(k->ioIn, 0, NULL); - memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float)); + void *base_in = IOSurfaceGetBaseAddress(k->ioIn); + if (g_fp16_io) { + _Float16 *dst = (_Float16*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = (_Float16)in[t*in_ch + c]; + } else { + float *dst = (float*)base_in; + for (int t = 0; t < sp; t++) + for (int c = 0; c < in_ch; c++) + dst[c*sp + t] = in[t*in_ch + c]; + } IOSurfaceUnlock(k->ioIn, 0, NULL); - free(tmp); NSError *e = nil; + id mdl = (__bridge id)k->model; + id req = (__bridge id)k->request; ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( - k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e); - float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float)); + mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e); IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float)); + void *base_out = IOSurfaceGetBaseAddress(k->ioOut); + if (g_fp16_io) { + _Float16 *src = (_Float16*)base_out; + for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = (float)src[c*sp + t]; + } else { + float *src = (float*)base_out; + 
for (int t = 0; t < sp; t++) + for (int c = 0; c < out_ch; c++) + out[t*out_ch + c] = src[c*sp + t]; + } IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL); - for (int t = 0; t < sp; t++) - for (int c = 0; c < out_ch; c++) - out[t*out_ch + c] = tmp2[c*sp + t]; - free(tmp2); } int main(int argc, char *argv[]) { From ef6dce373fb2ccda52fa2d973558bb8f3a1ca641 Mon Sep 17 00:00:00 2001 From: imperatormk Date: Mon, 2 Mar 2026 22:44:33 +0100 Subject: [PATCH 2/4] fix inmem_peak for M1/M2 --- inmem_peak.m | 61 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/inmem_peak.m b/inmem_peak.m index 87b8163..3334d01 100644 --- a/inmem_peak.m +++ b/inmem_peak.m @@ -8,6 +8,7 @@ static mach_timebase_info_data_t g_tb; static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; // M1/M2: cast op unsupported, use fp16 I/O directly NSData *buildWeightBlob(int ch, int depth) { NSUInteger wsize = ch * ch * 2; @@ -27,28 +28,45 @@ NSString *genMIL(int ch, int sp, int depth) { NSMutableString *m = [NSMutableString string]; - [m appendString:@"program(1.3)\n[buildInfo = dict({{\"coremlc-component-MIL\", \"3510.2.1\"}, {\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, {\"coremltools-version\", \"9.0\"}})]\n{\n"]; - [m appendFormat:@" func main(tensor x) {\n", ch, sp]; - [m appendString:@" string c_pad_type_0 = const()[name = string(\"c_pad_type_0\"), val = string(\"valid\")];\n" - @" tensor c_strides_0 = const()[name = string(\"c_strides_0\"), val = tensor([1, 1])];\n" - @" tensor c_pad_0 = const()[name = string(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" - @" tensor c_dilations_0 = const()[name = string(\"c_dilations_0\"), val = tensor([1, 1])];\n" - @" int32 c_groups_0 = const()[name = string(\"c_groups_0\"), val = int32(1)];\n" - @" string x_to_fp16_dtype_0 = const()[name = string(\"x_to_fp16_dtype_0\"), val = 
string(\"fp16\")];\n"]; - [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = string(\"cast_in\")];\n", ch, sp]; + [m appendString:@"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"]; + if (g_fp16_io) { + // fp16 I/O path — no cast ops (M1/M2 compatible) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } else { + // fp32 I/O path — cast to/from fp16 internally (M4+ native) + [m appendFormat:@" func main(tensor x) {\n", ch, sp]; + } + [m appendString: + @" tensor c_pad_type_0 = const()[name = tensor(\"c_pad_type_0\"), val = tensor(\"valid\")];\n" + @" tensor c_strides_0 = const()[name = tensor(\"c_strides_0\"), val = tensor([1, 1])];\n" + @" tensor c_pad_0 = const()[name = tensor(\"c_pad_0\"), val = tensor([0, 0, 0, 0])];\n" + @" tensor c_dilations_0 = const()[name = tensor(\"c_dilations_0\"), val = tensor([1, 1])];\n" + @" tensor c_groups_0 = const()[name = tensor(\"c_groups_0\"), val = tensor(1)];\n"]; + NSString *prev; + if (g_fp16_io) { + prev = @"x"; + } else { + [m appendString:@" tensor x_to_fp16_dtype_0 = const()[name = tensor(\"x_to_fp16_dtype_0\"), val = tensor(\"fp16\")];\n"]; + [m appendFormat:@" tensor x_to_fp16 = cast(dtype = x_to_fp16_dtype_0, x = x)[name = tensor(\"cast_in\")];\n", ch, sp]; + prev = @"x_to_fp16"; + } NSUInteger cs = 64 + ch*ch*2; - NSString *prev = @"x_to_fp16"; for (int i = 0; i < depth; i++) { - [m appendFormat:@" tensor W%d = const()[name = string(\"W%d\"), val = tensor(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n", + [m appendFormat:@" tensor W%d = const()[name = tensor(\"W%d\"), val = tensor(BLOBFILE(path = tensor(\"@model_path/weights/weight.bin\"), offset = tensor(%lu)))];\n", ch, ch, i, i, ch, ch, (unsigned long)(64 + i*cs)]; NSString *out = [NSString stringWithFormat:@"c%d", i]; - [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, 
strides = c_strides_0, weight = W%d, x = %@)[name = string(\"%@\")];\n", + [m appendFormat:@" tensor %@ = conv(dilations = c_dilations_0, groups = c_groups_0, pad = c_pad_0, pad_type = c_pad_type_0, strides = c_strides_0, weight = W%d, x = %@)[name = tensor(\"%@\")];\n", ch, sp, out, i, prev, out]; prev = out; } - [m appendString:@" string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"]; - [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = string(\"cast_out\")];\n", ch, sp, prev]; - [m appendString:@" } -> (c);\n}\n"]; + if (g_fp16_io) { + [m appendFormat:@" tensor c = identity(x = %@)[name = tensor(\"out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } else { + [m appendString:@" tensor to_fp32 = const()[name = tensor(\"to_fp32\"), val = tensor(\"fp32\")];\n"]; + [m appendFormat:@" tensor c = cast(dtype = to_fp32, x = %@)[name = tensor(\"cast_out\")];\n", ch, sp, prev]; + [m appendString:@" } -> (c);\n}\n"]; + } return m; } @@ -68,9 +86,18 @@ [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil]; [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES]; [wb writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; - if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -3;} + if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(compileWithQoS:options:error:),21,@{},&e)){ + [fm removeItemAtPath:td error:nil]; + if (!g_fp16_io) { + printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n"); + g_fp16_io = 1; + return bench(ch, sp, depth); + } + return -3; + } if(!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl,@selector(loadWithQoS:options:error:),21,@{},&e)){[fm removeItemAtPath:td error:nil];return -4;} - 
NSUInteger bytes=ch*sp*4; + size_t bpe = g_fp16_io ? 2 : 4; + NSUInteger bytes=ch*sp*bpe; IOSurfaceRef ioI=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); IOSurfaceRef ioO=IOSurfaceCreate((__bridge CFDictionaryRef)@{(id)kIOSurfaceWidth:@(bytes),(id)kIOSurfaceHeight:@1,(id)kIOSurfaceBytesPerElement:@1,(id)kIOSurfaceBytesPerRow:@(bytes),(id)kIOSurfaceAllocSize:@(bytes),(id)kIOSurfacePixelFormat:@0}); id wI=((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(AIO,@selector(objectWithIOSurface:),ioI); From 0cf13e2b8445219a609d75b6aac2811a8cdcc801 Mon Sep 17 00:00:00 2001 From: imperatormk Date: Tue, 3 Mar 2026 17:16:22 +0100 Subject: [PATCH 3/4] define g_fp16_io in train.m (fixes linker error) --- training/train.m | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/train.m b/training/train.m index 6fd4a86..6a70041 100644 --- a/training/train.m +++ b/training/train.m @@ -8,6 +8,8 @@ #include #include "backward.h" +int g_fp16_io = 0; // M1/M2: use fp16 I/O when cast op unsupported + static mach_timebase_info_data_t g_tb; static double ticksToMs(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } From 2d2adacf09ec6352cf8fea53c7a916bd5257144f Mon Sep 17 00:00:00 2001 From: imperatormk Date: Tue, 3 Mar 2026 18:26:12 +0100 Subject: [PATCH 4/4] wire up fp16 I/O retry in train.m forward path --- training/forward.h | 45 +++++++++++++++++++++++++++++++++------------ training/model.h | 45 +++++++++++++++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 24 deletions(-) diff --git a/training/forward.h b/training/forward.h index adcf898..0f2ca9f 100644 --- a/training/forward.h +++ b/training/forward.h @@ -9,22 +9,43 @@ // Transpose back to [S, out_dim] row-major static void ane_conv_eval(ANEKernel *kernel, const float *x, float *y, int S, int in_dim, int out_dim) { - float 
*x_t = (float*)malloc(S * in_dim * sizeof(float)); - for (int t = 0; t < S; t++) - for (int i = 0; i < in_dim; i++) - x_t[i*S + t] = x[t*in_dim + i]; + if (g_fp16_io) { + // fp16 I/O path: transpose + convert float→fp16, write, eval, read fp16→float + transpose + _Float16 *x_t = (_Float16*)malloc(S * in_dim * sizeof(_Float16)); + for (int t = 0; t < S; t++) + for (int i = 0; i < in_dim; i++) + x_t[i*S + t] = (_Float16)x[t*in_dim + i]; - ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float)); - ane_eval(kernel); + ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(_Float16)); + ane_eval(kernel); - float *y_t = (float*)malloc(S * out_dim * sizeof(float)); - ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float)); + _Float16 *y_t = (_Float16*)malloc(S * out_dim * sizeof(_Float16)); + ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(_Float16)); - for (int t = 0; t < S; t++) - for (int i = 0; i < out_dim; i++) - y[t*out_dim + i] = y_t[i*S + t]; + for (int t = 0; t < S; t++) + for (int i = 0; i < out_dim; i++) + y[t*out_dim + i] = (float)y_t[i*S + t]; + + free(x_t); free(y_t); + } else { + // fp32 I/O path: transpose, write, eval, read, transpose back + float *x_t = (float*)malloc(S * in_dim * sizeof(float)); + for (int t = 0; t < S; t++) + for (int i = 0; i < in_dim; i++) + x_t[i*S + t] = x[t*in_dim + i]; + + ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float)); + ane_eval(kernel); + + float *y_t = (float*)malloc(S * out_dim * sizeof(float)); + ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float)); - free(x_t); free(y_t); + for (int t = 0; t < S; t++) + for (int i = 0; i < out_dim; i++) + y[t*out_dim + i] = y_t[i*S + t]; + + free(x_t); free(y_t); + } } // CPU matmul fallback: y = W @ x, W[out_dim, in_dim], x[S, in_dim] → y[S, out_dim] diff --git a/training/model.h b/training/model.h index 6cee52f..b8da703 100644 --- a/training/model.h +++ b/training/model.h @@ -151,8 +151,9 @@ static int model_load_weights(Model *m, const char *path) { 
static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) { NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch); NSString *mil = mil_gen_conv(in_ch, out_ch, spatial); - size_t inBytes = (size_t)in_ch * spatial * 4; - size_t outBytes = (size_t)out_ch * spatial * 4; + size_t bpe = g_fp16_io ? 2 : 4; + size_t inBytes = (size_t)in_ch * spatial * bpe; + size_t outBytes = (size_t)out_ch * spatial * bpe; return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes); } @@ -161,9 +162,31 @@ static int model_compile_kernels(Model *m, int seq_len) { m->seq_len = seq_len; int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size; int S = seq_len; - printf("Compiling %d ANE conv kernels (S=%d)...\n", N_LAYERS * 7 + 1, S); + printf("Compiling %d ANE conv kernels (S=%d, %s I/O)...\n", + N_LAYERS * 7 + 1, S, g_fp16_io ? "fp16" : "fp32"); - for (int l = 0; l < N_LAYERS; l++) { + // Try first layer as canary — if cast op fails, retry with fp16 I/O + m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S); + if (!m->kern_q[0] && !g_fp16_io) { + printf(" Compile failed, retrying with fp16 I/O (M1/M2 fallback)...\n"); + g_fp16_io = 1; + m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S); + } + if (!m->kern_q[0]) { fprintf(stderr, "L0 kern_q fail\n"); return -1; } + + m->kern_k[0] = compile_conv_kernel(m->wk[0], d, d, S); + m->kern_v[0] = compile_conv_kernel(m->wv[0], d, d, S); + m->kern_o[0] = compile_conv_kernel(m->wo[0], d, d, S); + m->kern_w1[0] = compile_conv_kernel(m->w1[0], d, hd, S); + m->kern_w2[0] = compile_conv_kernel(m->w2[0], hd, d, S); + m->kern_w3[0] = compile_conv_kernel(m->w3[0], d, hd, S); + if (!m->kern_k[0] || !m->kern_v[0] || !m->kern_o[0] || + !m->kern_w1[0] || !m->kern_w2[0] || !m->kern_w3[0]) { + fprintf(stderr, "L0 compile fail\n"); return -1; + } + printf(" Layer 0 OK\n"); + + for (int l = 1; l < N_LAYERS; l++) { m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S); 
m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S); m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S); @@ -171,20 +194,18 @@ static int model_compile_kernels(Model *m, int seq_len) { m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S); m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S); m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S); - if (!m->kern_q[l]) { fprintf(stderr, "L%d kern_q fail\n",l); return -1; } - if (!m->kern_k[l]) { fprintf(stderr, "L%d kern_k fail\n",l); return -1; } - if (!m->kern_v[l]) { fprintf(stderr, "L%d kern_v fail\n",l); return -1; } - if (!m->kern_o[l]) { fprintf(stderr, "L%d kern_o fail\n",l); return -1; } - if (!m->kern_w1[l]) { fprintf(stderr, "L%d kern_w1 fail\n",l); return -1; } - if (!m->kern_w2[l]) { fprintf(stderr, "L%d kern_w2 fail\n",l); return -1; } - if (!m->kern_w3[l]) { fprintf(stderr, "L%d kern_w3 fail\n",l); return -1; } + if (!m->kern_q[l] || !m->kern_k[l] || !m->kern_v[l] || !m->kern_o[l] || + !m->kern_w1[l] || !m->kern_w2[l] || !m->kern_w3[l]) { + fprintf(stderr, "L%d compile fail\n", l); return -1; + } printf(" Layer %d OK\n", l); } m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S); if (!m->kern_cls) { fprintf(stderr, "Classifier kernel compile failed (dim=%d→vocab=%d too large?), using CPU for cls\n", d, vs); } - printf(" All kernels compiled (%d conv + %s)\n", N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU"); + printf(" All kernels compiled (%d conv + %s, %s I/O)\n", + N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU", g_fp16_io ? "fp16" : "fp32"); return 0; }