diff --git a/docs/ANE_CHAINING_RESEARCH.md b/docs/ANE_CHAINING_RESEARCH.md new file mode 100644 index 0000000..9457613 --- /dev/null +++ b/docs/ANE_CHAINING_RESEARCH.md @@ -0,0 +1,1111 @@ +# ANE ChainingRequest API Research + +Research into Apple Neural Engine private APIs for multi-kernel pipelining, conducted on M4 Max / macOS 15. + +**Goal**: Eliminate CPU round-trips between ANE layer evaluations. In a 12-layer model, sequential evaluation requires 23+ CPU-ANE round-trips per token. The `_ANEChainingRequest` API appears designed to let the ANE run operations back-to-back in a hardware pipeline, keeping data on-chip. + +**Status**: ChainingRequest validates and `prepareChainingWithModel:` no longer crashes (crash fix: pass nil for symbol/procedure params). Blocked on Code=15 (`ANEProgramChainingPrepare Failed`) -- the `_ANEModel` needs Espresso IR format (not MIL) for full symbol table population. At production dims (768x256), sequential ANE dispatch costs ~0.2ms/kernel; chaining would save ~23 round-trips per token. + +See also: [ANE_INTERNALS.md](ANE_INTERNALS.md) for comprehensive ANE documentation including compilation pipeline, hardware specs, and community research references. + +--- + +## Test Files + +| File | Purpose | +|------|---------| +| `training/test_chaining.m` | v1 prototype: sequential baseline + ChainingRequest creation | +| `training/test_chaining_v2.m` | v2 deep exploration: 6-phase probe of 12+ private classes | +| `training/test_ane_model.m` | Experiments E-P: _ANEModel loading, compiler, chaining, fences, type encoding, mapping | +| `training/test_throughput_ceiling.m` | Experiment I: 12-kernel throughput ceiling benchmark | + +Build and run: +```bash +cd training +make test_chaining && ./test_chaining +make test_chaining_v2 && ./test_chaining_v2 +make test_ane_model && ./test_ane_model +make test_throughput_ceiling && ./test_throughput_ceiling +``` + +--- + +## 1. 
Executive Summary + +### What works + +| Finding | Impact | Status | +|---------|--------|--------| +| `evaluateRealTimeWithModel:` via `_ANEClient` | 1.88x faster on small kernels (64x32); **no benefit at production dims** (768x256) | Benchmarked | +| `processRequest` via `_ANEProgramForEvaluation` | 1.34x faster on small kernels; marginal at production dims | Benchmarked | +| `_ANEBuffer` wraps IOSurface with `symbolIndex` | Solves input indexing for chaining | Proven | +| All 9 unexplored ANE classes exist on M4 Max | Full API surfaces documented | Documented | + +> **Important**: The RT execution speedup (1.88x) observed in isolated testing on 64x32 convolution kernels does **not** generalize to production dimensions. At 768x256 (Stories110M size), all four execution paths converge to ~0.2 ms per kernel. See [Production Dimension Results](#production-dimension-results-test_bench_pathsm-m4-max) below. + +### What's been solved + +| Finding | Status | Detail | +|---------|--------|--------| +| `_ANEIOSurfaceOutputSets` works with 64-byte statsSurRef | **SOLVED** | Any non-NULL IOSurface works as stats buffer | +| `_ANEChainingRequest.validate` returns YES | **SOLVED** | With proper `_ANEBuffer` inputs + `_ANEIOSurfaceOutputSets` outputs | +| `processRequest` via `_ANEProgramForEvaluation` | **1.34x faster** | Lower-level eval (0.131 ms vs 0.175 ms) | +| ChainingRequest factory crash (`[NSConstantIntegerNumber count]`) | **SOLVED** | Pass `nil` for `lbInputSymbolId`, `lbOutputSymbolId`, `procedureIndex` | +| `_ANEModel` loading from temp directory | **SOLVED** | `modelAtURL:key:` with tmpDir URL + hexStringIdentifier | +| `_ANESharedSignalEvent` / `_ANESharedWaitEvent` | **SOLVED** | Use `MTLSharedEvent` or `IOSurfaceSharedEventCreate()` | +| ChainingRequest type encodings | **DOCUMENTED** | All 9 factory params are `@` (object). 
`prepare` has 5 params (3x`@`, 1x`I` qos, 1x`^@` err) | + +### What's still blocked + +| Blocker | Root Cause | +|---------|------------| +| `prepareChainingWithModel:` returns Code=15 | `ANEProgramChainingPrepare() Failed` -- model not recognized as chaining-capable | +| `_ANEModel` has empty symbol table | MIL-compiled model shell lacks Espresso IR data (`model.espresso.net`) | +| `_ANEClient.loadModel:` / `compileModel:` fail | Require Espresso IR format, not MIL | +| `_ANEProgramIOSurfacesMapper` returns NO | Needs fully loaded model with symbol table | +| `_ANEPerformanceStats` with `_ANERequest` | Request expects `statType` selector on perfStats objects | + +--- + +## 2. ANE Private API Class Map + +### Core Classes (known working) + +**`_ANEInMemoryModel`** -- the model object for in-memory MIL compilation. +- `+inMemoryModelWithDescriptor:` -- create from `_ANEInMemoryModelDescriptor` +- `-compileWithQoS:options:error:` -- compile MIL to ANE binary +- `-loadWithQoS:options:error:` -- load compiled model onto ANE +- `-evaluateWithQoS:options:request:error:` -- standard evaluation (QoS 0-63, 21 default) +- `-unloadWithQoS:error:` -- unload from ANE +- Properties: `hexStringIdentifier`, `programHandle` (uint64), `program` (`_ANEProgramForEvaluation`), `perfStatsMask` +- Missing: `inputSymbolNames`, `outputSymbolNames`, `inputSymbolIndicesForProcedureIndex:` + +**`_ANEInMemoryModelDescriptor`** -- model specification. +- `+modelWithMILText:weights:optionsPlist:` -- create descriptor from MIL NSData + weight dict + +**`_ANERequest`** -- evaluation request packaging I/O surfaces. +- `+requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:` +- `perfStats` parameter expects `NSArray` of stat info objects (not `_ANEPerformanceStats`) + +**`_ANEIOSurfaceObject`** -- thin wrapper around `IOSurfaceRef`. 
+- `+objectWithIOSurface:` -- wrap a raw IOSurface +- Does NOT have `symbolIndex` property (this is the v1 blocker) + +**`_ANEClient`** -- client connection to the ANE daemon. +- `+sharedConnection` -- singleton accessor +- `-evaluateWithModel:options:request:qos:error:` -- 5-param eval via client +- `-evaluateRealTimeWithModel:options:request:error:` -- **RT priority eval (1.7x faster on small 64x32 kernels; no benefit at production dims)** +- `-doEvaluateDirectWithModel:options:request:qos:error:` -- direct eval bypass +- `-beginRealTimeTask` / `-endRealTimeTask` -- RT task bracketing (returns NO, but RT eval still works) +- `-prepareChainingWithModel:options:chainingReq:qos:error:` -- chaining setup +- `-enqueueSetsWithModel:outputSet:options:qos:error:` -- chaining output enqueue +- `-buffersReadyWithModel:inputBuffers:options:qos:error:` -- chaining input signal + +### Discovered Classes (v2 exploration) + +**`_ANEBuffer`** -- wraps `_ANEIOSurfaceObject` with index metadata. **Key discovery.** +- `+bufferWithIOSurfaceObject:symbolIndex:source:` -- factory + - `ioSurfaceObject`: an `_ANEIOSurfaceObject` (NOT raw `IOSurfaceRef`) + - `symbolIndex`: `NSNumber` mapping to compiled model I/O symbol + - `source`: `long long` -- 0=ANE, 1=output, 2=unknown +- Properties: `ioSurfaceObject`, `symbolIndex`, `source` +- Description format: `"_ANEBuffer: { ioSurface=0x... ; symbolIndex=0 ; ANEBufferProducerAgent=0 }"` + +**`_ANEProgramIOSurfacesMapper`** -- maps IOSurfaces to compiled model symbols. +- `+mapperWithProgramHandle:(uint64_t)handle` -- works, creates mapper +- `+mapperWithController:(id)ctrl` -- alternative factory +- `-mapIOSurfacesWithModel:request:cacheInference:error:` -- **FAILS** on `_ANEInMemoryModel` (calls `inputSymbolIndicesForProcedureIndex:` which doesn't exist) +- `-validateRequest:model:` -- also fails for same reason +- Implication: designed for `_ANEModel` (disk-based compiled models), not in-memory MIL + +**`_ANEProgramForEvaluation`** -- lower-level evaluation program. 
+- Accessible via `model.program` property +- `+programWithHandle:intermediateBufferHandle:queueDepth:` -- factory +- `-processRequest:model:qos:qIndex:modelStringID:options:returnValue:error:` -- low-level eval + +**`_ANEIOSurfaceOutputSets`** -- output set packaging for chaining. +- `+objectWithstatsSurRef:outputBuffer:` -- factory + - `statsSurRef`: `IOSurfaceRef` for perf stats collection -- **returns nil when NULL** + - `outputBuffer`: `NSArray` of `_ANEBuffer` objects +- This was the blocker during v2 exploration (the correct stats IOSurface format was unknown); it has since been resolved -- any non-NULL IOSurface works, and a 64-byte surface is sufficient (see Section 5) + +**`_ANEInputBuffersReady`** -- input signaling for chaining pipeline. +- `+inputBuffersWithProcedureIndex:inputBufferInfoIndex:inputFreeValue:executionDelay:` +- Parameters: procedure index, buffer info indices, free values, execution delay +- This is the mechanism that tells the ANE "inputs are ready, start processing" + +**`_ANEOutputSetEnqueue`** -- output pipeline configuration for chaining. +- `+outputSetWithProcedureIndex:setIndex:signalValue:signalNotRequired:isOpenLoop:` +- Configures output set enqueue behavior with signal values and open-loop mode + +**`_ANEChainingRequest`** -- the chaining request itself. +- `+chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:` +- `-validate` -- returns YES/NO +- Expects `inputs` as `_ANEBuffer` objects, `outputSets` as `_ANEIOSurfaceOutputSets` objects + +**`_ANEModelInstanceParameters`** -- model instance configuration. +- Alloc/init produces a valid object +- API surface dumped but not yet exercised + +**`_ANEDeviceController`** -- device-level controller. +- `+controllerWithProgramHandle:` -- attempted but returned nil in our tests + +**`_ANEQoSMapper`** -- QoS level mapping. +- API surface dumped, not yet exercised + +**`_ANEPerformanceStats`** -- performance statistics. 
+- `+statsWithHardwareExecutionNS:(uint64_t)ns` -- factory +- Properties: `hwExecutionTime`, `performanceCounters` +- Cannot be used with `_ANERequest.perfStats` (expects array of objects with `statType` selector) +- Setting `perfStatsMask=0xFF` on model works but `performanceCounters` returns nil + +**`_ANESharedSignalEvent` / `_ANESharedWaitEvent`** -- hardware sync primitives (not explored in v2; factories later verified in Experiment G, see Section 5). +- Likely the fence mechanism for GPU-ANE or multi-model synchronization +- Referenced in `_ANEChainingRequest.signalEvents` parameter + +--- + +## 3. Experiment Logs + +### v1: test_chaining.m Results (M4 Max) + +``` +=== ANE ChainingRequest Prototype === + +All required classes found. + +--- Phase 1: Compile two identical conv kernels --- + Kernel 1: compiled and loaded + Kernel 2: compiled and loaded + +--- Phase 2: Baseline (sequential eval) --- + Sequential: 10.355 ms total (0.207 ms/pair) + Output[0..3]: [0.2500, 0.2500, 0.2500, 0.2500] + +--- Phase 3: _ANEChainingRequest exploration --- + _ANEClient: obtained + ChainingRequest created: _ANEChainingRequest: { inputBuffer=( + "_ANEIOSurfaceObject: { ioSurface=0x... ; startOffset=0 }" + ) ; outputSets=( ... 
) } + validate: NO + +--- Phase 4: Loopback ChainingRequest --- + ChainingRequest created (loopback) + validate: NO + prepareChainingWithModel: EXCEPTION (validate fails first) + +--- Summary --- + Sequential baseline: 0.207 ms/pair (two evals + memcpy) + ChainingRequest: creates but validate FAILS + Root cause: _ANEIOSurfaceObject lacks symbolIndex property + Next: explore _ANEBuffer and _ANEProgramIOSurfacesMapper +``` + +### v2: test_chaining_v2.m Results (M4 Max) + +**Phase 1: Class Introspection** +- 9 classes found, 0 missing +- All classes exist on M4 Max / macOS 15 +- Full method lists, properties, and type encodings dumped for each + +**Phase 2: Symbol Name Discovery** +- `inputSymbolNames`: NOT available on `_ANEInMemoryModel` +- `outputSymbolNames`: NOT available on `_ANEInMemoryModel` +- `programHandle`: YES (uint64 handle to compiled program) +- `_ANEIOSurfaceObject` does NOT have `symbolIndex` getter or setter +- `+objectWithIOSurface:symbolIndex:` class method NOT available + +**Phase 3: IOSurface Mapper & Buffer Experiments** + +3a: `_ANEProgramIOSurfacesMapper` +``` + mapperWithProgramHandle(12345): created successfully + mapIOSurfacesWithModel: EXCEPTION + -[_ANEInMemoryModel inputSymbolIndicesForProcedureIndex:]: + unrecognized selector + validateRequest:model: EXCEPTION (same reason) +``` + +3b: `_ANEBuffer` -- **success** +``` + bufferWithIOSurfaceObject(symIdx=0, source=0): + _ANEBuffer: { ioSurface=0x... ; symbolIndex=0 ; ANEBufferProducerAgent=0 } + bufferWithIOSurfaceObject(symIdx=0, source=1): + _ANEBuffer: { ioSurface=0x... ; symbolIndex=0 ; ANEBufferProducerAgent=1 } + bufferWithIOSurfaceObject(symIdx=0, source=2): + _ANEBuffer: { ioSurface=0x... ; symbolIndex=0 ; ANEBufferProducerAgent=2 } + bufferWithIOSurfaceObject(symIdx=1, source=0): + _ANEBuffer: { ioSurface=0x... 
; symbolIndex=1 ; ANEBufferProducerAgent=0 } + symbolIndex property: accessible and correct +``` + +3c: `_ANEIOSurfaceObject` symbolIndex experiments +``` + setSymbolIndex: NOT available on _ANEIOSurfaceObject + symbolIndex getter: NOT available + +objectWithIOSurface:symbolIndex: NOT available +``` + +3d: IOSurface property experiments +``` + IOSurface 'symbolIndex' property (set via IOSurfaceSetValue): 0 + _ANEIOSurfaceObject.symbolIndex after property set: + (IOSurface user properties do NOT propagate to _ANEIOSurfaceObject) +``` + +3e: `_ANEProgramForEvaluation` +``` + k1.model.program: <_ANEProgramForEvaluation: 0x...> + (accessible via model.program property) +``` + +**Phase 4: ChainingRequest Retry** + +4a: Sequential baseline +``` + Sequential: 0.259 ms/pair (50 iters) + Output[0..3]: [0.2500, 0.2500, 0.2500, 0.2500] +``` + +Attempts 1-4: Various raw IOSurface configurations +``` + [Attempt 1] Standard (raw IOSurfaceObject): CRASH + -[_ANEIOSurfaceObject symbolIndex]: unrecognized selector + [Attempt 2] IOSurface with symbolIndex property: CRASH (same) + [Attempt 3] Two-model loopback: CRASH (same) + [Attempt 4] Skip validate, call prepareChainingWithModel directly: CRASH (same) +``` + +Attempt 5: `_ANEBuffer` + `_ANEIOSurfaceOutputSets` +``` + bufIn: _ANEBuffer: { ... symbolIndex=0 ; ANEBufferProducerAgent=0 } + bufOut: _ANEBuffer: { ... 
symbolIndex=0 ; ANEBufferProducerAgent=1 } + outputSet (objectWithstatsSurRef:NULL outputBuffer:@[bufOut]): nil + -> _ANEIOSurfaceOutputSets returns nil when statsSurRef is NULL +``` + +Attempt 6: `_ANEClient.evaluateWithModel:` -- **works** +``` + evaluateWithModel (via client): YES +``` + +Attempt 7: `_ANEClient.doEvaluateDirectWithModel:` -- **works** +``` + doEvaluateDirectWithModel: YES +``` + +**Phase 5: Alternative Execution Paths** + +5a: Real-time eval -- **1.7x speedup** +``` + beginRealTimeTask: NO (possibly needs entitlement) + evaluateRealTimeWithModel: YES + + RT eval: 0.090 ms/eval avg (50 iters) + Standard eval: 0.157 ms/eval avg (50 iters) + RT vs Standard speedup: 1.74x + + endRealTimeTask: NO +``` + +5b: PerfStats +``` + perfStatsMask = 0x01..0x80: set OK (all masks accepted) + statsWithHardwareExecutionNS:0 = <_ANEPerformanceStats> + Eval with @[perfStats]: OK (no crash when wrapped in array) + hwExecutionTime after eval: nil + Eval with mask=0xFF, perfStats=nil: OK + performanceCounters: nil +``` + +--- + +## 4. 
Evaluation Path Benchmarks + +Measured on 64x32 convolution kernels, M4 Max, 200 iterations after 10 warmup: + +| Method | Latency | Speedup | API | +|--------|---------|---------|-----| +| `evaluateWithQoS:` (standard) | 0.175 ms | 1.0x | `model.evaluateWithQoS:options:request:error:` | +| `evaluateRealTimeWithModel:` | 0.093 ms | **1.88x** | `client.evaluateRealTimeWithModel:options:request:error:` | +| `processRequest` | 0.131 ms | **1.34x** | `program.processRequest:model:qos:qIndex:modelStringID:options:returnValue:error:` | +| `doEvaluateDirectWithModel:` | 0.225 ms | 0.78x | `client.doEvaluateDirectWithModel:options:request:qos:error:` | + +Key observations (small kernel, isolated): +- RT eval was fastest in isolated test (1.88x speedup on 64x32) +- `processRequest` was faster than standard but slower than RT +- `doEvaluateDirectWithModel` was actually **slower** than standard (0.78x) +- `beginRealTimeTask` returning NO does not prevent `evaluateRealTimeWithModel:` from working + +### Production Dimension Results (test_bench_paths.m, M4 Max) + +At realistic kernel sizes with multiple compiled models, the picture changes: + +| Config | Standard | RT | processRequest | ane_eval_rt | +|--------|----------|-----|----------------|-------------| +| 64x32 (test) | 0.109 ms | 0.233 ms (0.5x) | 0.156 ms (0.7x) | 0.195 ms (0.6x) | +| 128x64 | 0.208 ms | 0.184 ms (1.1x) | 0.201 ms (1.0x) | 0.185 ms (1.1x) | +| 256x64 | 0.197 ms | 0.212 ms (0.9x) | 0.203 ms (1.0x) | 0.157 ms (1.3x) | +| 512x64 | 0.120 ms | 0.147 ms (0.8x) | 0.194 ms (0.6x) | 0.179 ms (0.7x) | +| 768x256 (prod) | 0.205 ms | 0.246 ms (0.8x) | 0.185 ms (1.1x) | 0.291 ms (0.7x) | + +**Key finding**: The RT eval speedup observed in isolated testing (1.88x) does not hold at production dimensions. At 768x256 (Stories110M size), all eval paths perform similarly (~0.2 ms), with standard eval being competitive or fastest. 
The overhead of the client-based paths (RT, direct) outweighs any ANE scheduling benefit at scale. + +--- + +## 5. Remaining Blockers and Next Steps + +### SOLVED: _ANEIOSurfaceOutputSets statsSurRef + +The chaining pipeline requires: +1. Inputs as `_ANEBuffer` objects with `symbolIndex` -- **SOLVED** +2. OutputSets as `_ANEIOSurfaceOutputSets` objects -- **SOLVED** + +A 64-byte IOSurface as `statsSurRef` is sufficient. `_ANEChainingRequest.validate` returns YES with this setup. + +### SOLVED: ChainingRequest parameter type mismatch (Experiments K-L) + +The `[NSConstantIntegerNumber count]` crash was caused by passing `NSNumber` values for `lbInputSymbolId`, `lbOutputSymbolId`, and `procedureIndex`. Type encoding analysis (Experiment K) revealed all 9 factory parameters are `@` (id/object), but the factory internally calls `count` on them, so `NSNumber` values crash. In practice only `nil` is accepted: Experiment L shows that array values crash as well (on `unsignedIntegerValue`). + +**Fix**: Pass `nil` for `lbInputSymbolId`, `lbOutputSymbolId`, and `procedureIndex`: +```objc +chainingRequestWithInputs:@[buf] outputSets:@[outSet] + lbInputSymbolId:nil lbOutputSymbolId:nil procedureIndex:nil + signalEvents:@[] transactionHandle:@0 fwEnqueueDelay:@0 memoryPoolId:@0 +``` +This produces a valid `_ANEChainingRequest` (`validate` returns YES) and `prepareChainingWithModel:` no longer crashes. + +### Current Blocker: ANEProgramChainingPrepare() Failed (Code=15) + +`prepareChainingWithModel:` now returns NO with error: +``` +Error Domain=com.apple.appleneuralengine Code=15 +"ANEProgramChainingPrepare() Failed: Program chaining prepare error" +``` + +Three model types were tested; both `_ANEModel` variants return this error, while `_ANEInMemoryModel` crashes before reaching it: +- Fresh `_ANEModel` (state=1, populated with programHandle+program) +- Populated `_ANEModel` from Experiment E (state=5 after failed loadModel/compileModel) +- `_ANEInMemoryModel` still crashes on `getUUID` (cannot be used with chaining at all) + +The `Code=15` error is a **logical failure** in the ANE daemon's chaining preparation, not a crash. 
The model is not fully recognized as "chaining-capable" by the daemon, likely because: +1. The `_ANEModel` was populated by copying `programHandle`/`program` from an `_ANEInMemoryModel`, not loaded through the standard CoreML/Espresso pipeline +2. Symbol indices remain empty (the daemon may require them for chaining buffer routing) +3. The model needs `model.espresso.net` format (not MIL) for `_ANEClient.loadModel:` / `compileModel:` + +**Previous blocker (SOLVED)**: `[NSConstantIntegerNumber count]` crash -- fixed by passing `nil` for symbol/procedure params. + +### Experiments E-H Results (test_ane_model.m) + +#### Experiment E: _ANEModel Loading -- SOLVED + +`_ANEModel.modelAtURL:key:` works with the compiled temp directory URL and `hexStringIdentifier` as key: +``` +diskModel = _ANEModel.modelAtURL:key:(tmpDirURL, hexId) + -> _ANEModel with UUID, getUUID works + -> state=1, program=nil, programHandle=0 (shell only) +``` + +Populating the shell with `_ANEInMemoryModel` data: +``` +diskModel.setProgramHandle:(inMemoryModel.programHandle) -> success +diskModel.setProgram:(inMemoryModel.program) -> success +``` + +After population, `programHandle` and `program` are set, but `inputSymbolIndicesForProcedureIndex:0` still returns empty `NSIndexSet`. The symbol table data isn't stored in the `_ANEProgramForEvaluation` -- it's likely in the `model.hwx` or `net.plist` that the standard CoreML path generates. 
+ +#### Experiment E2: ANECompiler -- No ObjC API + +- `ANECompiler.framework` exists at `/System/Library/PrivateFrameworks/ANECompiler.framework/` but contains **no ObjC classes** -- it's a pure C library (`ANECCompile()` is the entry point, called internally by `_ANEInMemoryModel.compileWithQoS:`) +- `debug_mask` option had no visible effect on compilation output +- No `ane_compiler_service` found at standard paths +- Key `_ANEInMemoryModel` compilation methods found: `saveModelFiles`, `localModelPath`, `compiledModelExists`, `mapIOSurfacesWithRequest:cacheInference:error:` + +#### Experiment F: Chaining Pipeline -- Blocked + +At the time of this experiment, with a populated `_ANEModel` (has UUID + programHandle + program), `prepareChainingWithModel:` still crashed on `[NSConstantIntegerNumber count]`. The crash is in the `_ANEChainingRequest` parameter handling, not in the model itself; it was later resolved in Experiments K-L by passing `nil` for the symbol/procedure parameters. + +#### Experiment G: Hardware Fences -- FULLY SOLVED + +Both `_ANESharedSignalEvent` and `_ANESharedWaitEvent` now work: + +```objc +// MTLSharedEvent via Metal (works) +id device = MTLCreateSystemDefaultDevice(); +id sharedEvent = [device newSharedEvent]; + +// IOSurfaceSharedEvent via IOKit (also works) +id iosEvent = IOSurfaceSharedEventCreate(); + +// Signal event factory: (uint64_t value, unsigned int symbolIndex, long long eventType, id sharedEvent) +_ANESharedSignalEvent.signalEventWithValue:symbolIndex:eventType:sharedEvent: + -> works with both MTLSharedEvent and IOSurfaceSharedEvent + +// Wait event factory: (uint64_t value, id sharedEvent) +_ANESharedWaitEvent.waitEventWithValue:sharedEvent: + -> works with both event types +``` + +Event types 0, 1, 2 all produce valid signal events. The `eventType` property is correctly set. + +#### Experiment H: Alternative Preparation -- Same Crash + +`doPrepareChainingWithModel:options:chainingReq:qos:error:` exists with an identical signature and crashes identically. Full `_ANEClient` API (46 instance methods) documented in test output. 
+ +### Throughput Ceiling (test_throughput_ceiling.m, Experiment I) + +12-kernel pipeline benchmarks on M4 Max: + +| Config | Sequential (run+memcpy) | Run-only | Memcpy-only | GCD Serial | +|--------|------------------------|----------|-------------|------------| +| 64x32 (test) | 0.272 ms/kernel | 0.158 ms/kernel | 0.001 ms/copy | 0.200 ms/kernel | +| 256x64 (small) | 0.191 ms/kernel | 0.181 ms/kernel | 0.002 ms/copy | 0.176 ms/kernel | +| 768x256 (prod) | 0.177 ms/kernel | 0.226 ms/kernel | 0.006 ms/copy | 0.186 ms/kernel | + +**Key findings**: +- **Memcpy overhead is negligible** (<0.01 ms per copy even at 393KB). Not the bottleneck. +- **CPU round-trip overhead** is in the ANE dispatch itself, not data movement. +- At production dims, sequential with memcpy is actually *faster* than eval-only (pipeline caching effect). +- **GCD serial queue** provides modest improvement at small dims but marginal at production. +- **Chaining's value** would be eliminating the ~0.2ms/kernel ANE dispatch overhead, not memcpy. With 12 kernels, total pipeline takes ~2.1ms (prod), so eliminating dispatch could potentially halve this. + +### Experiments K-P Results (test_ane_model.m, 2026-03-04) + +#### Experiment K: Type Encoding Analysis -- COMPLETE + +Full type encodings for all chaining-related methods: + +| Method | Encoding | Notes | +|--------|----------|-------| +| `chainingRequestWithInputs:...` | `@88@0:8@16@24@32@40@48@56@64@72@80` | All 9 params are `@` (id/object) | +| `prepareChainingWithModel:...` | `B52@0:8@16@24@32I40^@44` | 5 params: 3x `@`, 1x `I` (uint32 qos), 1x `^@` (error ptr) | +| `doPrepareChainingWithModel:...` | `B52@0:8@16@24@32I40^@44` | Same signature as prepareChainingWithModel | + +The `_ANEChainingRequest` factory takes 9 object parameters. The `lbInputSymbolId`, `lbOutputSymbolId`, and `procedureIndex` are all `@` (object), not raw integers. 
Internally, the factory calls `unsignedIntegerValue` (from NSNumber) or `count` (from NSArray) on these parameters. + +| `_ANEChainingRequest` Property | Encoding | Type | +|-------------------------------|----------|------| +| `procedureIndex` | `@` | id (nil or NSArray) | +| `loopbackInputSymbolIndex` | `@` | id (nil or NSArray) | +| `loopbackOutputSymbolIndex` | `@` | id (nil or NSArray) | + +#### Experiment L: Array-Typed Parameters -- BREAKTHROUGH + +| Combo | lbIn | lbOut | procIdx | Factory | Validate | Prepare | +|-------|------|-------|---------|---------|----------|---------| +| L.1: Arrays `@[@(-1)]` | `@[@(-1)]` | `@[@(-1)]` | `@[@0]` | CRASH: `unsignedIntegerValue` on NSArray | - | - | +| L.2: Arrays `@[@0]` | `@[@0]` | `@[@0]` | `@[@0]` | CRASH: `unsignedIntegerValue` on NSArray | - | - | +| L.3: Empty `@[]` | `@[]` | `@[]` | `@[]` | CRASH: `unsignedIntegerValue` on empty array | - | - | +| **L.4: nil** | **nil** | **nil** | **nil** | **OK** | **YES** | **NO (Code=15)** | +| L.5: NSNumber | `@(-1)` | `@(-1)` | `@0` | CRASH: `count` on NSNumber | - | - | + +**Passing `nil` for all three symbol/procedure params gets past both the factory crash and the `prepareChainingWithModel` crash.** The `validate` returns YES and `prepareChainingWithModel:` returns a clean error (Code=15: `ANEProgramChainingPrepare() Failed`) instead of crashing. + +#### Experiment M: Load Model via _ANEClient -- BLOCKED + +Both `loadModel:` and `compileModel:` on `_ANEClient` require **Espresso IR** format (`model.espresso.net`), not MIL: +``` +Error Domain=com.apple.appleneuralengine.espresso Code=-1 +"_ANEEspressoIRTranslator : error Cannot load network '.../model.espresso.net'" +``` + +`compiledModelExistsFor:` returns NO for our MIL-compiled model. After the failed load/compile attempts, the `_ANEModel` state changes from 1 to 5 (error/invalid state). 
+ +The standard CoreML pipeline generates `model.espresso.net` (Espresso IR) and `model.espresso.weights` from the `.mlpackage` / `.mlmodelc` format. Our MIL-only path bypasses this, so we can't use `_ANEClient.loadModel:` without first generating the Espresso IR. + +#### Experiment N: IOSurface Mapping -- PARTIAL + +`_ANEProgramIOSurfacesMapper`: +- `mapperWithProgramHandle:` creates a valid mapper from the `_ANEInMemoryModel` programHandle +- `mapIOSurfacesWithModel:request:cacheInference:error:` returns NO (no exception, no error output) +- `validateRequest:model:` returns NO +- `_ANEModel.mapper` property is nil +- `prepareANEMemoryMappingParams:request:` revealed `ANEMemoryMappingParamsStruct` has 128 `ANEBufferStruct` slots: `[128{ANEBufferStruct=^{__IOSurface}IiiI}]` + +The mapper appears to need a fully loaded model with symbol table data that our MIL-compiled shell doesn't have. + +#### Experiment O: Procedure Info -- EMPTY + +- `procedureInfoForProcedureIndex:0` returns **nil** on the populated `_ANEModel` +- `procedureCount` is not a method or KVC-accessible property +- `modelAttributes` returns empty dictionary `{}` +- `inputSymbolNames` / `outputSymbolNames` not available on `_ANEModel` +- The `symbolIndicesForProcedureIndex:indexArrayKey:` method exists (takes `I` + `@`) but symbol data is empty + +#### Experiment P: Full Chaining Retry -- Code=15 + +Tested with three model types, all using nil for symbol params: + +| Model | State | validate | prepare Result | +|-------|-------|----------|---------------| +| Fresh `_ANEModel` (state=1, populated) | 1 | YES | NO (Code=15) | +| `_ANEInMemoryModel` | 3 | YES | CRASH: `getUUID` | +| Populated `_ANEModel` (from E, state=5) | 5 | YES | NO (Code=15) | + +Also documented `_ANEInputBuffersReady` and `_ANEOutputSetEnqueue` type signatures: + +| Class | Factory | Param Types | +|-------|---------|-------------| +| `_ANEInputBuffersReady` | 
`inputBuffersWithProcedureIndex:inputBufferInfoIndex:inputFreeValue:executionDelay:` | `I` (uint32), `@` (NSArray), `@` (NSArray), `Q` (uint64) | +| `_ANEOutputSetEnqueue` | `outputSetWithProcedureIndex:setIndex:signalValue:signalNotRequired:isOpenLoop:` | `I`, `I`, `Q`, `B`, `B` | + +### Experiments Q-S Results (test_coreml_chaining.m, 2026-03-04) + +#### Experiment Q: CoreML Pipeline -- MAJOR DISCOVERY + +**The E5 runtime (macOS 15+) does NOT use `_ANEModel` or `_ANEChainingRequest` at all.** + +CoreML on macOS 15 uses the MIL-based "E5" runtime, which completely bypasses the older Espresso/`_ANEModel`/`_ANEChainingRequest` path: + +| Component | Old Path (Espresso) | New Path (E5/MIL) | +|-----------|--------------------|--------------------| +| Model format | `.espresso.net` + `.espresso.weights` | `model.mil` + `weights/weight.bin` | +| Model class | `_ANEModel` | `e5rt_program_library` (C struct) | +| Engine | `_ANEClient` + `_ANERequest` | `MLE5Engine` + `MLE5ExecutionStreamOperation` | +| Chaining | `_ANEChainingRequest` | `e5rt_execution_stream_operation` (unknown) | +| Compile | `_ANEClient.compileModel:` | `e5rt_program_library` AOT compilation | +| Sync | `_ANESharedSignalEvent` | `IOSurfaceSharedEventListener` + `MTLSharedEvent` | + +Key findings: +- `MLModel.compileModelAtURL:` produces `.mlmodelc` with `model.mil` (NOT `model.espresso.net`) +- Loading an `MLModel` creates `MLDelegateModel` -> `MLE5Engine` -> `MLE5ProgramLibrary` -> `MLE5ProgramLibraryOnDeviceAOTCompilationImpl` +- No `_ANEModel` exists anywhere in the E5 object graph +- `_ANEClient.loadModel:` / `compileModel:` both require `model.espresso.net` which isn't generated +- Prediction succeeds (model runs on ANE), confirming E5 runtime works independently of `_ANEModel` + +Internal E5 class hierarchy: +``` +MLDelegateModel + └── _internalEngine: MLE5Engine + ├── _programLibrary: MLE5ProgramLibrary + │ ├── _programLibraryHandle: e5rt_program_library* (opaque C struct) + │ ├── _impl: 
MLE5ProgramLibraryOnDeviceAOTCompilationImpl + │ │ ├── _milTextURL: NSURL + │ │ ├── _irProgram: shared_ptr (C++) + │ │ └── _container: MLProgramE5Container + │ └── _container: MLProgramE5Container + │ ├── _modelAssetDescription + │ ├── _compilerVersionInfo + │ └── _functionInfoArray + └── _operationPool: MLE5StaticShapeExecutionStreamOperationPool + └── _pool: NSMutableSet of MLE5ExecutionStreamOperation + ├── _operationHandle: e5rt_execution_stream_operation* (opaque) + ├── _programLibrary: MLE5ProgramLibrary + ├── _inputPorts / _outputPorts: NSArray + ├── _waitEventListener: IOSurfaceSharedEventListener + └── _completionSharedEventBoundToESOP: MTLSharedEvent +``` + +#### Experiment R: Chaining with CoreML model -- BLOCKED + +No `_ANEModel` extracted from E5 runtime, so `prepareChainingWithModel:` cannot be tested with a CoreML-compiled model. The E5 runtime is a completely separate execution path. + +#### Experiment S: Two-Kernel Chaining -- BLOCKED + +Blocked by Experiment R. The `_ANEChainingRequest` API appears to be from the **older Espresso-based runtime** and may not be usable with models compiled through the E5/MIL path. 
+ +### Experiments T-V Results (2026-03-04) + +#### Experiment T: E5 Runtime Symbol Scan + +Found 4 exported C functions from the `e5rt_*` API: +- `e5rt_program_library_create` -- creates program library handle +- `e5rt_execution_stream_create` -- creates execution stream handle +- `e5rt_async_event_create` -- creates async event for synchronization +- `e5rt_async_event_signal` -- signals an async event + +Key ObjC classes in the E5 runtime: +- `MLE5ExecutionStreamOperation` (63 instance methods) -- holds `e5rt_execution_stream_operation*`, manages input/output ports +- `MLE5ExecutionStream` (29 instance methods) -- holds `e5rt_execution_stream*`, executes `operations` array +- `MLE5ExecutionStreamPool` -- manages streams via `takeOut` / `putBack:` +- `MLE5InputPort` / `MLE5OutputPort` -- hold `e5rt_io_port*`, bind features to ports +- `MLE5InputPortBinder` / `MLE5OutputPortBinder` -- handle memory binding for ports +- `MLE5ProgramLibrary` -- holds `e5rt_program_library*` + +Critical method: `MLE5ExecutionStream._executeStream:error:` takes `e5rt_execution_stream*` and executes **all operations** in the `operations` array in sequence. + +#### Experiment U: E5 Multi-Op Stream -- MAJOR BREAKTHROUGH + +**Successfully executed multiple ANE operations in a single E5 stream, achieving up to 4.87x speedup over sequential CoreML.** + +Method: +1. Load multiple CoreML models (`.mlpackage` -> `MLModel`) +2. Extract `MLE5ProgramLibrary` from each model's `MLE5Engine` +3. Create `MLE5ExecutionStreamOperation` for each, backed by each program library +4. Preload operations (`preloadAndReturnError:`) to compile ANE programs +5. Borrow an `MLE5ExecutionStream` from the stream pool +6. Set multiple operations on the stream via `setOperations:` +7. Prepare each operation's input features via `prepareForInputFeatures:options:error:` +8. 
Execute all operations in one call via `_executeStream:error:` + +#### Benchmark Results (M4 Max, macOS 15, N=500) + +| Kernels | CoreML Sequential | E5 Multi-Op Stream | Speedup | +|---------|------------------|--------------------|---------| +| 1 (256ch) | 0.0359 ms | 0.0272 ms | **1.32x** | +| 2 (256+512ch) | 0.0623 ms | 0.0406 ms | **1.53x** | +| 3 (256+512+1024ch) | 0.1599 ms | 0.0578 ms | **2.77x** | +| 4 (256+512+1024+2048ch) | 0.3781 ms | 0.0776 ms | **4.87x** | + +Key observations: +- E5 stream per-kernel overhead is remarkably consistent: ~0.02 ms/kernel regardless of count +- CoreML sequential overhead grows non-linearly (0.036 -> 0.095 ms/kernel with 4 kernels) +- The speedup increases with more kernels: the dispatch overhead is amortized +- All operations execute on ANE with a single `_executeStream:` call + +Code path for E5 multi-op stream: +``` +// 1. Extract internals from CoreML-loaded model +id e5engine = [mlModel valueForKey:@"_internalEngine"]; // MLE5Engine +id progLib = [e5engine valueForKey:@"programLibrary"]; // MLE5ProgramLibrary +id pool = [e5engine valueForKey:@"streamPool"]; // MLE5ExecutionStreamPool + +// 2. Create operation from program library +id op = [[MLE5ExecutionStreamOperation alloc] + initWithProgramLibrary:progLib functionName:@"main" + modelDescription:desc configuration:cfg + debugLabel:@"myOp" modelSignpostId:0]; +[op preloadAndReturnError:nil]; + +// 3. Get stream and set operations +id stream = [pool takeOut]; +void *sh = stream._streamHandle; // e5rt_execution_stream* +[stream setOperations:@[op1, op2, op3]]; + +// 4. 
Prepare and execute +for (op in operations) + [op prepareForInputFeatures:features options:predOpts error:nil]; +[stream _executeStream:sh error:nil]; +``` + +### Revised Assessment (after T-V) + +~~The **E5 runtime** (`MLE5ExecutionStream` + `MLE5ExecutionStreamOperation`) is the correct path for multi-kernel pipelining on macOS 15+.~~ **CORRECTED in Experiment W1 (see below).** + +### Experiments W1-W5: Validation & Deep API Documentation (2026-03-04) + +#### W1: Output Correctness Validation + +**CRITICAL CORRECTION**: The previously reported "4.87x speedup" from multi-op streams was **invalid**. Validation revealed: + +1. `MLE5Engine.predictionFromFeatures:options:error:` produces **EXACT** (bit-identical) output to `MLModel.predictionFromFeatures:error:` for all tested sizes (256, 512, 1024, 2048 channels). This confirms the E5 engine is the correct computation path. + +2. Our manually-created `MLE5ExecutionStreamOperation` objects via `initWithProgramLibrary:` **do not produce correct output** -- they return all zeros. The `_executeStream:` call returns YES but no actual ANE compute occurs. The operation handles are `0x0` (not compiled), meaning our manually-created ops were never wired to actual ANE programs. + +3. The "speedup" was measuring the overhead of a no-op function returning immediately vs CoreML doing actual computation. + +4. `MLE5StaticShapeExecutionStreamOperationPool.takeOutOperationForFeatures:error:` returns pool-managed operations with valid handles, but using them with `_executeStream:` still produces zeros -- the output port bindings are not correctly populated. + +5. Stream reuse via `_predictionFromFeatures:stream:options:error:` fails with "E5RT: Port bindings cannot be changed while operation is in use in an execution stream" -- streams are locked after first use and cannot be reconfigured. 
+ +#### W1 Performance Profile + +| Path | 256ch (ms) | 2048ch (ms) | +|------|-----------|-------------| +| CoreML API (`predictionFromFeatures:error:`) | 0.035 | 0.217 | +| Engine direct (`predictionFromFeatures:options:error:`) | 0.074 | 0.284 | +| Engine private (`_predictionFromFeatures:options:error:`) | 0.100 | 0.332 | +| Stream pool cycle (takeOut + putBack) | 0.008 | 0.008 | +| Op pool cycle | <0.001 | <0.001 | + +**Key finding: CoreML API is FASTER than calling the engine directly.** `MLDelegateModel` implements internal caching (likely keeping a hot stream + operation) that avoids the per-call pool acquire/release overhead. The engine's `predictionFromFeatures:` method performs pool management on every call. + +#### W2: Exhaustive E5 Runtime API + +Full class dumps captured for all E5 runtime classes. Key classes and their roles: + +**`MLE5Engine`** (49 instance methods, 10 ivars) +- Superclass: `MLModelEngine` +- Entry point: `predictionFromFeatures:options:error:` (public), `_predictionFromFeatures:stream:options:error:` (internal) +- Key properties: `streamPool` (MLE5ExecutionStreamPool), `operationPool` (MLE5StaticShapeExecutionStreamOperationPool), `programLibrary` (MLE5ProgramLibrary) +- Manages: stream acquisition, operation preparation, input conforming, output post-processing + +**`MLE5ProgramLibrary`** (17 instance methods, 5 ivars) +- Holds `_programLibraryHandle` (C struct `e5rt_program_library*`) +- Key method: `createOperationForFunctionName:forceRespecialization:hasRangeShapeInputs:error:` -- returns C-level `e5rt_execution_stream_operation*` +- Contains: compiled MIL program, model configuration, implementation object + +**`MLE5ExecutionStreamOperation`** (63 instance methods, ~20 ivars) +- Holds `_operationHandle` (C struct `e5rt_execution_stream_operation*`) +- States: 0=created, transitions through prepare/execute +- Key methods: `prepareForInputFeatures:options:error:`, `preloadAndReturnError:`, `outputFeatures` +- Has input/output/state ports (MLE5InputPort, MLE5OutputPort) 
+- Internal binding: `_bindInputFeaturesAndWaitEvents:options:error:`, `_bindOutputPortsWithOptions:error:` +- Port binding modes: `directlyBoundFeatureValue` (zero-copy) vs `copyFeatureValue` (memcpy) + +**`MLE5ExecutionStream`** (21 instance methods, 5 ivars) +- Holds `_streamHandle` (C struct `e5rt_execution_stream*`) +- Key methods: `_executeStream:error:`, `executeForInputFeatures:options:error:`, `submitWithCompletionHandler:` +- Operations set via `setOperations:` (NSArray of MLE5ExecutionStreamOperation) +- Reset via `_cleanUpStream:` on engine + +**`MLE5ExecutionStreamPool`** (11 instance methods) +- Pool pattern: `takeOut` / `putBack:` +- Creates streams on demand with `e5rt_execution_stream_create` +- Tracks all streams via `allStreams` + +**`MLE5StaticShapeExecutionStreamOperationPool`** (17 instance methods) +- Pool for operations with fixed input shapes +- Key method: `takeOutOperationForFeatures:error:` -- matches feature shape to pooled operation + +**`MLE5InputPort` / `MLE5OutputPort`** +- Wraps `e5rt_io_port*` handles +- Each has a `binder` (MLE5InputPortBinder / MLE5OutputPortBinder) +- Input binder has `bindingMode` (char): controls copy vs direct binding +- Output binder has `outputBacking` and `featureValue` for result retrieval + +**`MLE5InputPortBinder`** (16 instance methods, 6 ivars) +- `bindingMode` (char): 0=copy, 1=direct +- `bindMemoryObjectForFeatureValue:error:` -- zero-copy IOSurface binding +- `copyFeatureValue:error:` -- memcpy binding + +**`MLE5OutputPortBinder`** (27 instance methods, 9 ivars) +- `outputBacking` -- output buffer +- `boundFeatureDirectly` (BOOL) -- tracks binding mode +- `_makeFeatureValueFromPort:featureDescription:error:` -- read ANE output + +**`MLProgramE5Container`** (11 instance methods, 6 ivars) +- Container for compiled model assets +- `URLOfMILText` -- path to MIL source +- `compilerOutput` -- `MLCompilerNeuralNetworkOutput` +- `findPrecompiledE5BundleAndReturnError:` -- looks for pre-compiled E5 bundle 
+ +**e5rt_* C API** (found via dlsym): +- `e5rt_program_library_create` -- creates program library from MIL +- `e5rt_execution_stream_create` -- creates execution stream +- `e5rt_async_event_create` -- creates async event for synchronization +- `e5rt_async_event_signal` -- signals async event + +#### W4: Async Stream Submission + +`submitWithCompletionHandler:` **FAILED** with: "Failed to add operation to E5 stream. E5RT: Reset stream to add more operations to stream. (2)". The stream must be in a specific state (reset) before async submission is possible. The stream state becomes locked after `_executeStream:` or `executeForInputFeatures:`. + +#### W5: Port-Based Data Flow + +- Each operation has `inputPorts` (array of MLE5InputPort) and `outputPorts` (array of MLE5OutputPort) +- Input binding mode 1 = direct binding (zero-copy from MLMultiArray) +- Output `outputBacking` is nil after manual execution -- bindings are not populated by our manual path +- Port handles are `e5rt_io_port*` C structs -- connecting ports across operations would require knowing the C API for port linking + +### Revised Assessment (after W1-W5) + +1. **CoreML API is already near-optimal** for single-model inference. The `MLDelegateModel` wrapper is faster than calling engine methods directly due to internal stream/operation caching. + +2. **Manual `_executeStream:` with custom operations is invalid** -- it produces zero output. The operations must be created through the engine's internal pipeline (via `_predictionFromFeatures:stream:options:error:`) which handles binding correctly. + +3. 
**The opportunity for speedup lies in**: + - Eliminating ObjC overhead via direct `e5rt_*` C API calls + - Batching multiple models into a single stream (requires understanding `e5rt_execution_stream_operation` lifecycle) + - Direct MIL compilation to `e5rt_program_library` without going through CoreML + +### Experiment X1: Custom MIL -> ANE Execution (BREAKTHROUGH) + +**Pipeline discovered**: Write MIL text file -> `MLE5ProgramLibraryOnDeviceAOTCompilationImpl` -> `MLE5ProgramLibrary` -> `MLE5Engine` -> `predictionFromFeatures:` + +```objc +// 1. Write MIL text to file +NSString *mil = @"program(1.3)\n{\n func main(...) { ... } -> (cast_out);\n}\n"; +[mil writeToFile:@"/tmp/custom.mil" ...]; + +// 2. Compile MIL to E5 program library +id aotImpl = [[MLE5ProgramLibraryOnDeviceAOTCompilationImpl alloc] + initWithMILTextAtURL:milURL container:refContainer configuration:cfg]; +void *plHandle = [aotImpl createProgramLibraryHandleWithRespecialization:NO error:&err]; + +// 3. Create program library + engine +id progLib = [[MLE5ProgramLibrary alloc] initWithImpl:aotImpl container:refContainer configuration:cfg]; +id engine = [[MLE5Engine alloc] initWithProgramLibrary:progLib modelDescription:desc ...]; +[engine prepareWithConcurrencyHint:1 error:nil]; + +// 4. 
Execute +id result = [engine predictionFromFeatures:fp options:opts error:&err]; +``` + +**Requirements**: +- MIL input/output variable names must match the model description (e.g., `x` for input, `cast_out` for output) +- MIL shapes must match the model description shapes +- A "container" (`MLProgramE5Container`) is borrowed from a pre-compiled CoreML model (needed for compilation context) +- Input/output types should be fp32 with internal fp16 compute (cast in/out) for ANE compatibility + +**Verified kernels** (all produce EXACT correct output on ANE): + +| Kernel | MIL Op | Verification | +|--------|--------|-------------| +| ReLU | `relu(x=x16)` | Max diff = 0.000000, 0/16384 wrong | +| GELU | `gelu(x=x16, mode="TANH_APPROXIMATION")` | Verified against reference | +| Elementwise (x*2+1) | `mul` + `add` with scalar constants | Verified against reference | +| Softmax | `softmax(x=x16, axis=-1)` | Sum = 1.000000 | +| Layer Norm | `layer_norm(x=x16, axes=[3], epsilon=1e-5)` | Mean = 0.000000, Var = 0.999975 | + +**Significance**: This allows compiling **arbitrary MIL programs** (any operation supported by Apple's MIL spec) to run on the ANE, without going through CoreML's .mlpackage pipeline. This is the foundation for custom training/inference kernels. + +### Experiment Y1: Fused SDPA on ANE (PASSED) + +**Operation**: `scaled_dot_product_attention(query=Q, key=K, value=V)` -- single fused op for entire attention computation. + +Config: B=1, nHeads=1, seqLen=256, headDim=64 (self-attention: Q=K=V=reshape(input)) + +| Metric | Value | +|--------|-------| +| Max abs diff (vs CPU) | 0.000021 | +| Relative error | 1.40e-03 | +| Latency (first call) | 2.454 ms | +| **Benchmark** | **0.1708 ms/eval** | + +### Experiment Y2: Linear with Embedded Weights (PASSED) + +**Operation**: `linear(x=flat, weight=Wc, bias=Bc)` where `Wc` and `Bc` are compile-time `const` tensors embedded in the MIL program. 
+ +Config: input [256, 64], linear 64->64 with embedded weight matrix and bias vector. + +| Metric | Value | +|--------|-------| +| Max abs diff (vs CPU) | 0.001106 | +| Relative error | 1.05e-02 | +| **Benchmark** | **0.0610 ms/eval** | + +**Significance**: Confirms that compile-time weight constants work in MIL text format. This is the foundation for transformer inference (where weights are frozen). + +### Experiment Y3: Complete Transformer Block on ANE (PASSED) + +**Pipeline**: LayerNorm -> SDPA (self-attention) -> Residual Add -> LayerNorm -> FFN (linear+GELU+linear) -> Residual Add + +All in a **single MIL program**, compiled and executed as one ANE operation. + +Config: seqLen=256, dim=64, ffnDim=128, 1-head attention, embedded FFN weights. + +| Metric | Value | +|--------|-------| +| Output mean abs | 1.017404 (non-zero, correct) | +| **Benchmark** | **0.2091 ms/eval** | + +**Significance**: A full transformer layer runs on ANE in ~0.2ms. This proves that complex multi-op pipelines can be compiled as single MIL programs with no CPU round-trips between ops. The ANE compiler fuses the entire graph. + +### Experiment Z1: Backward Pass (Gradient Computation) on ANE (PASSED) + +**Operations**: `matmul(x=dY, y=W)` for dX (input gradient), `matmul(x=dY, y=dY, transpose_x=true)` for dW (weight gradient). Both use **runtime tensors** (not const), proving backward-pass operations work on ANE. + +Also tests: `slice_by_index` for tensor slicing, `concat` for packing results. + +Config: dY [128,64] @ W [64,64] -> dX [128,64]; dY^T [64,128] @ dY [128,64] -> dW [64,64] + +| Metric | dX | dW | +|--------|-----|-----| +| Max abs diff | 0.001940 | 0.012828 | +| Relative error | 1.02e-02 | 3.92e-02 | +| **Benchmark** | **0.0593 ms/eval** (both combined) | + +**Significance**: This is the first demonstration of ANE executing gradient computation operations. The `matmul` with `transpose_x=true` works correctly, producing valid weight gradients. 
Combined with Y3's forward pass, this establishes the complete pipeline for manual ANE training: +1. Forward pass: Y3-style MIL (0.2 ms) +2. Backward pass: Z1-style MIL (0.06 ms) +3. Weight update: CPU (trivial) +4. Recompile: (~10-50 ms, dominates training time) + +### MIL Text Syntax Lessons Learned + +Key syntax rules discovered during Y/Z experiments: + +1. **`epsilon` in `layer_norm`**: Must be same dtype as gamma/beta. Use `fp16 eps = const()[..., val = fp16(1e-5)]` when gamma is fp16. +2. **Boolean params**: Use `bool tx = const()[..., val = bool(true)]` for params like `transpose_x`. +3. **`concat` axis**: Must be `int32` scalar, not `tensor`. Use `int32 ax = const()[..., val = int32(0)]`. +4. **`concat` interleave**: Required param, use `bool il = const()[..., val = bool(false)]`. +5. **MLE5Engine init**: Correct selector is `initWithProgramLibrary:modelDescription:configuration:functionName:classProbabilitiesFeatureName:optionalInputDefaultValues:compilerVersionInfo:` (7 args). +6. **Container path**: On macOS 15+, models may use Espresso backend. Create `MLProgramE5Container` via `initWithModelAssetPath:configuration:` using the `.mlmodelc` path. +7. **Sandbox**: E5RT needs write access to `~/Library/Caches/` for model specialization cache. + +### Next Steps + +1. **[HIGH] Multi-head attention** -- test SDPA with multiple heads (reshape to [B, nHeads, seqLen, headDim]) +2. **[HIGH] Real Qwen2.5 layer weights** -- load actual model weights into MIL const tensors +3. **[HIGH] Full backward pass** -- implement complete transformer backward pass (attention + FFN gradients) +4. **[MEDIUM] Training loop** -- forward + backward + weight update + recompile cycle +5. **[MEDIUM] Explore e5rt_* C API directly** -- bypass ObjC wrappers for lower overhead +6. 
**[LOW] Runtime weight injection** -- investigate if weights can be updated without recompilation + +**Phase 7: OutputSets with stats IOSurface -- BREAKTHROUGH** +``` + statsSurRef size=64 bytes: + objectWithstatsSurRef: _ANEIOSurfaceOutputSets: { statsSurRef= + id = 0x... width = 64 height = 1 pixelFormat = 0 + name = test_chaining_v2 ; outputBuffer=( + "_ANEBuffer: { ... symbolIndex=0 ; ANEBufferProducerAgent=1}" + )} + + Attempting ChainingRequest with valid outputSet... + ChainingRequest created | validate: YES <-- FIRST TIME VALIDATE PASSES! + prepareChainingWithModel EXCEPTION: + -[_ANEInMemoryModel getUUID]: unrecognized selector +``` + +**Phase 8: Disk-based _ANEModel** +``` + _ANEModel class found (12 class methods, 52 instance methods, 17 properties) + Has: getUUID, inputSymbolIndicesForProcedureIndex:, + outputSymbolIndicesForProcedureIndex:, mapper, program + Factory: +modelAtURL:key:, +modelAtURL:key:modelAttributes:, etc. + + tmpDir contents: (weights, model.mil, net.plist, data) + +modelAtURL: NOT available (needs key: parameter) + -> _ANEModel could not be loaded (need correct factory + key) +``` + +**Phase 9: processRequest via ProgramForEvaluation** +``` + k1.model.program: _ANEProgramForEvaluation: { programHandle=1319967543575 + intermediateBufferHandle=0 queueDepth=127 } + processRequest single call: YES (rv=NO) + processRequest: 0.131 ms/eval (50 iters) + vs RT eval: 1.45x (slower than RT but faster than standard) +``` + +**Phase 10: Shared Events** +``` + _ANESharedEvents: found (+sharedEventsWithSignalEvents:waitEvents:) + _ANESharedSignalEvent: found + +signalEventWithValue:symbolIndex:eventType:sharedEvent: + Properties: sharedEvent (IOSurfaceSharedEvent), value, symbolIndex, agentMask, eventType + alloc/init: nil (needs sharedEvent parameter) + _ANESharedWaitEvent: found + +waitEventWithValue:sharedEvent: + alloc/init: nil (needs sharedEvent parameter) + -> Both require IOSurfaceSharedEvent objects, not available from bare init +``` + 
+--- + +## 6. Architecture: Chaining Data Flow + +``` +Current (sequential): + CPU -> IOSurface -> ANE eval layer 1 -> IOSurface -> CPU memcpy + CPU -> IOSurface -> ANE eval layer 2 -> IOSurface -> CPU memcpy + ... (23 round-trips for 12-layer model) + +Target (chained): + CPU -> IOSurface -> ANE eval layer 1 -> [on-chip] -> ANE eval layer 2 + -> [on-chip] -> ... -> IOSurface -> CPU + (1 round-trip for entire model) + +Current best (sequential with standard path): + At production dims (768x256), all paths are ~0.2ms/kernel. + RT path only helps for small kernels (64x32: 1.88x speedup). + For 24 evals/token at ~0.2ms each: ~4.8ms total ANE time per token. + Chaining target: 1 round-trip instead of 24, saving ~23 x overhead per trip. +``` + +--- + +## 7. Class Hierarchy (inferred) + +``` +NSObject +├── _ANEClient (singleton, daemon connection) +├── _ANEInMemoryModelDescriptor (MIL + weights spec) +├── _ANEInMemoryModel (compile/load/run -- in-memory MIL path) +│ └── .program -> _ANEProgramForEvaluation +├── _ANEModel (disk-based compiled model -- 52 methods, has getUUID) +│ └── .program -> _ANEProgramForEvaluation +│ └── .mapper -> _ANEProgramIOSurfacesMapper +├── _ANERequest (I/O surface packaging) +├── _ANEIOSurfaceObject (thin IOSurface wrapper) +├── _ANEBuffer (IOSurfaceObject + symbolIndex + source) +├── _ANEChainingRequest (multi-op pipeline) +├── _ANEIOSurfaceOutputSets (output packaging for chaining) +├── _ANEInputBuffersReady (input signaling for chaining) +├── _ANEOutputSetEnqueue (output enqueue config for chaining) +├── _ANEProgramIOSurfacesMapper (symbol-to-surface mapping) +├── _ANEProgramForEvaluation (lower-level eval program) +├── _ANEModelInstanceParameters (model config) +├── _ANEDeviceController (device-level control) +├── _ANEQoSMapper (QoS level mapping) +├── _ANEPerformanceStats (perf counters) +├── _ANESharedSignalEvent (hardware signal fence) +└── _ANESharedWaitEvent (hardware wait fence) +``` + +--- + +## 8. 
MIL Operations Reference (for Custom ANE Kernels) + +Source: [coremltools MIL Ops API Reference](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html) + +The following MIL operations are available for writing custom ANE kernels via our `MLE5ProgramLibraryOnDeviceAOTCompilationImpl` pipeline (Experiment X1). All ops below have been confirmed available in the MIL text format used by the E5 compiler on macOS 15+. + +### Transformer-Critical Ops + +| Op | Signature | Notes | +|----|-----------|-------| +| `scaled_dot_product_attention` (iOS 18+) | `(query:[B,*?,L,E], key:[B,*?,S,E], value:[B,*?,S,EV], attn_mask?) -> [B,*?,L,EV]` | Fused `softmax(Q@K.T/sqrt(d))@V`. Single op for entire attention computation. | +| `linear` | `(x:[*D,D_in], weight:const[D_out,D_in], bias:const[D_out]?) -> [*D,D_out]` | `x @ W.T + b`. **Weight/bias must be compile-time constants.** Rank 1-3 input. | +| `matmul` | `(x:[*,K1], y:[*,K2], transpose_x?, transpose_y?) -> [*,T]` | N-D batch matmul with broadcasting. Supports runtime (non-const) inputs. | +| `layer_norm` | `(x, axes, gamma?, beta?, epsilon?) -> same shape` | Verified working on ANE (Experiment X1). | +| `gelu` | `(x, mode=EXACT/TANH_APPROXIMATION/SIGMOID_APPROXIMATION) -> same shape` | Verified working on ANE (Experiment X1). | +| `softmax` | `(x, axis) -> same shape` | Verified working on ANE (Experiment X1). | +| `relu` | `(x) -> same shape` | Verified working on ANE (Experiment X1). | + +### Data Movement Ops + +| Op | Signature | Notes | +|----|-----------|-------| +| `gather` | `(x, indices, axis?) -> gathered` | For embedding table lookups. | +| `gather_along_axis` | `(x, indices, axis?) -> gathered` | Take values along axis at index locations. | +| `scatter` | `(data, indices, updates, axis?, mode?) -> scattered` | For KV cache writes. Mode: update/add/sub/mul/div/max/min. | +| `scatter_along_axis` | `(data, indices, updates, axis?, mode?) -> scattered` | Scatter updates along axis. 
| + +### Elementwise / Reduction Ops + +| Op | Notes | +|----|-------| +| `add`, `sub`, `mul`, `real_div` | Elementwise with broadcasting. | +| `cast` | Type conversion (fp32 <-> fp16). Required for ANE I/O (fp32 in, fp16 compute, fp32 out). | +| `reduce_sum`, `reduce_mean`, `reduce_max` | Reduction along axes. | +| `rsqrt`, `sqrt`, `exp`, `log`, `tanh` | Unary elementwise. Useful for manual norm/activation implementations. | +| `concat`, `split`, `reshape`, `transpose` | Shape manipulation. | +| `slice_by_index`, `slice_by_size` | Tensor slicing for KV cache windowing. | + +### Key Constraints + +1. **`linear` weights must be `const`**: For inference this is fine (weights don't change). For training, use `matmul` with runtime tensors instead. +2. **MIL text format**: Programs use `program(1.3) { func main(...) { ... } -> (output); }` syntax. Constants use `const()[name=..., val=...]`. Weights reference blob files via `BLOBFILE(path=..., offset=...)`. +3. **ANE I/O convention**: Input/output should be fp32; internal compute should be fp16. Use `cast` ops at boundaries. +4. **Shape constraints**: ANE prefers NCHW layout. Most ops work with rank-4 tensors `[B, C, H, W]` but `linear`/`matmul` work with lower ranks. + +--- + +## 9. ANE Training Feasibility Analysis + +### Apple's Official Position + +Apple's deprecated **MLCompute** framework (`MLCDevice.ane()`) explicitly states: +> "This device applies to inference graphs only. It doesn't work with a training graph or inference graph that shares layers with a training graph." + +This means Apple never shipped ANE-based training, even in their own training framework. The `MLCTrainingGraph` class supported `executeForward`, `executeGradient`, and `executeOptimizerUpdate` but only on CPU and GPU devices. 
+ +### WWDC 2025 Confirmation + +WWDC 2025 Session 360 ("Discover ML & AI frameworks") confirms: +- CoreML dispatches to CPU, GPU, and Neural Engine at runtime for **inference** +- MLX is the recommended tool for training/fine-tuning but uses Metal GPU, not ANE +- No mention of ANE training APIs in any Apple framework +- BNNSGraph (Accelerate) added `BNNSGraphBuilder` for CPU-only real-time inference + +### Why ANE Lacks Native Training Support + +The ANE is a fixed-function inference accelerator. It likely lacks: +- Hardware support for automatic differentiation / backward passes +- Ability to write to weight storage during execution (weights are read-only constants in the `e5rt_program_library`) +- Dynamic memory allocation needed for activation checkpointing + +### Manual ANE Training Approach + +Despite the lack of native support, training on ANE is theoretically possible using our custom MIL pipeline: + +1. **Forward pass**: Write MIL program with `linear`/`matmul`/`layer_norm`/`gelu` ops. Weights embedded as constants. Execute on ANE. Save activations. +2. **Backward pass**: Write separate MIL programs for each layer's gradient computation: + - Linear backward: `dX = dY @ W` (matmul), `dW = dY.T @ X` (matmul) + - ReLU backward: `dX = dY * (X > 0)` (elementwise) + - LayerNorm backward: Multiple reduction + elementwise ops +3. **Optimizer step**: Run on CPU (simple elementwise: `W -= lr * dW`) +4. **Recompile**: After weight update, recompile MIL with new weights for next forward pass + +The key bottleneck is step 4: recompiling MIL after every weight update. The `createProgramLibraryHandleWithRespecialization:` call takes ~10-50ms, which would dominate training time. This makes per-step ANE training impractical unless we can find a way to update weights without recompilation (e.g., via the `e5rt_*` C API or runtime weight injection). 
diff --git a/docs/ANE_INTERNALS.md b/docs/ANE_INTERNALS.md new file mode 100644 index 0000000..c7eae4d --- /dev/null +++ b/docs/ANE_INTERNALS.md @@ -0,0 +1,563 @@ +# ANE Internals: What We Know + +A comprehensive guide to Apple's Neural Engine (ANE) based on reverse engineering, private API exploration, and community research. This extends and updates [hollance/neural-engine](https://github.com/hollance/neural-engine/tree/master/docs) with findings from direct hardware experimentation on M4 Max / macOS 15. + +--- + +## Table of Contents + +1. [How does the ANE work internally?](#1-how-does-the-ane-work-internally) +2. [Can I program the ANE directly?](#2-can-i-program-the-ane-directly) +3. [What can be compiled and run on ANE?](#3-what-can-be-compiled-and-run-on-ane) +4. [Security and safety mechanisms](#4-security-and-safety-mechanisms) +5. [Is the ANE 16-bit?](#5-is-the-ane-16-bit) +6. [ANE vs GPU vs CPU](#6-ane-vs-gpu-vs-cpu) +7. [Reverse engineering the ANE](#7-reverse-engineering-the-ane) +8. [How to verify ANE execution](#8-how-to-verify-ane-execution) +9. [References and external resources](#9-references-and-external-resources) + +--- + +## 1. How does the ANE work internally? + +> hollance/neural-engine says: "I don't think anyone outside Apple knows." + +We now know substantially more. + +### Hardware Architecture + +The ANE is a fixed-function neural network accelerator integrated into Apple Silicon SoCs: + +| Chip | ANE Cores | Peak TOPS | SRAM Budget | +|------|-----------|-----------|-------------| +| A12-A13 | 8 | 5 | ~4 MB | +| A14/M1 | 16 | 11 | ~16 MB | +| A15/M2 | 16 | 15.8 | ~24 MB | +| M4/M4 Pro/M4 Max | 16 | 38 | ~24-32 MB | + +SRAM budget measured via `sram_probe.m` performance cliff detection on M4 Max: +- Peak efficiency at ~12.5 MB weights (282.6 GFLOPS/MB) +- First spill at ~32 MB (drops to 59.2 GFLOPS/MB) +- Catastrophic spilling at 128 MB (8.0 GFLOPS/MB) + +The ANE operates on FP16 data exclusively. 
All I/O is through IOSurface shared memory buffers in `[1, C, 1, S]` channel-first FP16 layout. + +### Compilation Pipeline + +There are two paths from a neural network to ANE hardware execution: + +**Standard CoreML path** (from [Black Hat Asia 2021, Wish Wu](https://infocondb.org/con/black-hat/black-hat-asia-2021/apple-neural-engine-internal-from-ml-algorithm-to-hw-registers)): + +``` +ML model (TF/PyTorch/Caffe) + -> coremltools -> .mlmodel + -> coremlc (CoreML compiler) -> .mlmodelc/ + -> espresso precompile -> net.plist + weights + -> ANECompiler (in ane_compiler_service) -> model.hwx + -> aned daemon -> H11ANEIn kernel driver (IOKit) + -> ANE firmware -> hardware registers +``` + +**Direct private API path** (what this project uses): + +``` +MIL text + weight blobs (in memory) + -> _ANEInMemoryModelDescriptor (ObjC object) + -> _ANEInMemoryModel.compileWithQoS: -> ANE binary (in temp dir) + -> _ANEInMemoryModel.loadWithQoS: -> loaded onto ANE hardware + -> _ANEInMemoryModel.evaluateWithQoS: -> execution via aned +``` + +The direct path bypasses CoreML, espresso, and the `.hwx` file format entirely. It compiles MIL (Model Intermediate Language) text directly into ANE-executable binary, loads it, and runs it. This is how we achieve both training and inference on the ANE without any CoreML dependency. 
+ +### System Architecture + +``` ++------------------+ +------------------+ +------------------+ +| User Process | | aned daemon | | Kernel | +| | | | | | +| _ANEClient -----+---->| ANE scheduler +---->| H11ANEIn driver | +| (sharedConnection)| | (all interfaces) | | (IOKit) | +| | | | | | +| App gets 3 IOKit | | Compiles models | | Passes model.hwx | +| interfaces: | | Manages loading | | to ANE firmware | +| - open | | Handles requests | | | +| - close | +------------------+ +------------------+ +| - programSend | | +| Request | v ++------------------+ +------------------+ + | ANE Firmware | + | (co-processor) | + | | + | Parses register | + | operations from | + | compiled binary | + +------------------+ +``` + +The `aned` daemon mediates between user processes and the kernel driver. Apps only get 3 IOKit interfaces (open, close, programSendRequest). The daemon has access to all driver interfaces, which is why `_ANEClient.sharedConnection` communicates through the daemon rather than directly to the kernel. + +### Execution Paths + +We have benchmarked four distinct ways to trigger ANE kernel execution: + +| Method | API | Latency (64x32) | Latency (768x256) | +|--------|-----|------------------|--------------------| +| Standard | `model.evaluateWithQoS:options:request:error:` | 0.175 ms | 0.205 ms | +| Real-Time | `client.evaluateRealTimeWithModel:options:request:error:` | 0.093 ms | 0.246 ms | +| processRequest | `program.processRequest:model:qos:...` | 0.131 ms | 0.185 ms | +| Direct | `client.doEvaluateDirectWithModel:options:request:qos:error:` | 0.225 ms | N/A | + +**Key finding**: At production kernel dimensions (768x256, matching Stories110M), all paths converge to ~0.2 ms per kernel. The RT speedup (1.88x) observed on small 64x32 kernels does not hold at production scale. The standard path remains the most reliable. + +### Resource Limits + +The ANE runtime leaks internal resources during compilation. 
After ~119 compiles per process, subsequent compilations fail silently. The workaround is checkpoint-and-restart: save weights and optimizer state, terminate the process, and re-launch with `--resume`. + +With `MAX_COMPILES=100` (conservative) and 60 weight-bearing kernels per batch (12 layers x 5 kernels), only 1 training batch fits per process lifetime. + +--- + +## 2. Can I program the ANE directly? + +> hollance/neural-engine says: "Unfortunately not. You can only use the Neural Engine through Core ML." + +**Yes, you can.** The `AppleNeuralEngine.framework` contains 67+ private Objective-C classes that provide direct access to the ANE without CoreML. This project uses them for both training and inference. + +### Minimal Example + +The core compilation/load/execution cycle in pseudocode: + +```objc +#import <Foundation/Foundation.h> +#import <dlfcn.h> + +// Load the private framework +dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + +// Write MIL program as text +NSData *milData = [@"program(1.0) { ... 
}" dataUsingEncoding:NSUTF8StringEncoding]; + +// Create descriptor +id descriptor = [_ANEInMemoryModelDescriptor modelWithMILText:milData + weights:weightDict + optionsPlist:nil]; + +// Compile -> Load -> Run +id model = [_ANEInMemoryModel inMemoryModelWithDescriptor:descriptor]; +[model compileWithQoS:21 options:nil error:&error]; +[model loadWithQoS:21 options:nil error:&error]; + +// Create IOSurface I/O and request +id request = [_ANERequest requestWithInputs:@[inputSurface] + inputIndices:@[@0] + outputs:@[outputSurface] + outputIndices:@[@0] + weightsBuffer:nil + perfStats:nil + procedureIndex:0]; + +[model evaluateWithQoS:21 options:nil request:request error:&error]; +``` + +A complete reusable wrapper is implemented in [`training/ane_runtime.h`](../training/ane_runtime.h) with functions: +- `ane_init()` -- load framework, resolve classes +- `ane_compile(kernel, mil_text, weight_dict)` -- compile MIL to ANE binary +- `ane_run(kernel)` -- standard execution path +- `ane_free(kernel)` -- unload and release resources + +### MIL (Model Intermediate Language) + +MIL is Apple's intermediate representation for neural network operations. Key facts: + +- Text-based format: `program(1.0) { func main(...) { ... } }` +- Targets: `ios16`, `ios17`, `ios18` (determines available ops) +- All tensors are 4D: `[batch, channels, height, width]` or equivalently `[1, C, 1, S]` +- Convolutions (`conv`) are the workhorse: a 1x1 conv with `[out_ch, in_ch, 1, 1]` weights = matrix multiply +- Weights referenced via `BLOBFILE(path="@model_path/weights/name.bin", offset=uint64(64))` +- Weights are baked at compile time and cannot be swapped at runtime + +Supported operations include: `conv`, `matmul`, `add`, `mul`, `sigmoid`, `softmax`, `reshape`, `transpose`, `concat`, `reduce_mean`, `rsqrt`, `cast`, `constexpr_affine_dequantize`, and more. 
+ +### Alternative: ANECompiler CLI + +[ANETools](https://github.com/antgroup-skyward/ANETools) (from Wish Wu / Ant Group) provides command-line tools that invoke the ANECompiler module directly: + +```bash +# Convert mlmodelc to ANE-compatible format +MLModelCToANECompiler input.mlmodelc output/ + +# Compile to hardware format +ANECompiler --target-arch ane_v5 --debug-mask 2147483647 net.plist weights/ output.hwx + +# Disassemble compiled binary +ANEDisassembler output.hwx +``` + +The `--debug-mask` flag (set to max integer) generates intermediate files during compilation, revealing internal register operations. + +--- + +## 3. What can be compiled and run on ANE? + +Any computation expressible as a static MIL (Model Intermediate Language) dataflow graph that the E5 compiler accepts. The ANE is a fixed-function accelerator, not a general-purpose processor -- it executes predefined operation graphs, not arbitrary code. + +### Verified Operations + +These operations have been compiled to custom MIL programs and executed on ANE hardware with output validated against CPU reference implementations (see `test_mil_custom.m`): + +| Category | Operations | Notes | +|----------|-----------|-------| +| Activations | `relu`, `gelu`, `softmax` | GELU supports EXACT, TANH_APPROXIMATION, SIGMOID_APPROXIMATION modes | +| Normalization | `layer_norm` | Epsilon type must match gamma/beta dtype | +| Attention | `scaled_dot_product_attention` | Fused Q@K^T/sqrt(d) + softmax + @V in a single op (iOS 18+) | +| Linear algebra | `linear` (const weights), `matmul` (runtime tensors) | `linear` requires compile-time constant weights; `matmul` supports runtime inputs | +| Type conversion | `cast` | fp32 <-> fp16. 
Required at ANE I/O boundaries | +| Elementwise | `add`, `mul`, `real_div` | Broadcasting supported | +| Shape | `reshape`, `transpose`, `concat`, `slice_by_index` | `concat` requires `interleave` param | +| Composite | Full transformer block (LN + SDPA + Residual + FFN + GELU) | Compiles and runs as a single ANE program (~0.21ms) | + +### Available but Not Yet Tested + +These are valid MIL operations that the E5 compiler should accept: + +- `conv` -- convolutions (the upstream maderix/ANE repo uses these extensively for training) +- `reduce_sum`, `reduce_mean`, `reduce_max` -- reductions +- `gather`, `scatter` -- embedding lookups, KV cache writes +- `rsqrt`, `sqrt`, `exp`, `log`, `tanh` -- unary math +- `split`, `slice_by_size` -- tensor slicing +- `batch_norm`, `instance_norm` -- normalization variants +- Various pooling, padding, upsampling operations + +### What Cannot Run on ANE + +| Limitation | Detail | +|-----------|--------| +| No control flow | No loops, conditionals, or branching. MIL is a static dataflow graph. | +| No dynamic shapes | All tensor dimensions must be known at compile time. | +| No runtime weight updates | Weights are `const`, baked into the compiled binary. Changing weights requires recompilation (~10-50ms). | +| No arbitrary memory access | No pointers or indexing beyond what `gather`/`scatter` provide. | +| No custom ops | Only operations in Apple's MIL op set. No user-defined kernels at the hardware level. | +| No FP32 compute | ANE computes in FP16 only. FP32 inputs are cast to FP16 internally. | + +### Implications for Training + +The ANE can execute the forward pass and the matrix math of backpropagation (`matmul` for dX and dW gradients). However, training is impractical because weights are read-only constants. After computing weight gradients on ANE, the optimizer step (W -= lr * dW) must run on CPU, and the MIL program must be recompiled with updated weights before the next forward pass. 
This recompilation costs ~10-50ms per step, dominating training time. See [ANE_CHAINING_RESEARCH.md, Section 9](ANE_CHAINING_RESEARCH.md#9-ane-training-feasibility-analysis) for detailed analysis. + +--- + +## 4. Security and Safety Mechanisms + +The ANE has multiple layers of safety enforcement, but Apple's security model assumes access goes through CoreML. The private APIs we use bypass CoreML but still pass through the `aned` daemon and the E5 compiler. + +### Compile-Time Safety + +| Mechanism | What it does | +|-----------|-------------| +| MIL syntax validation | The E5 compiler rejects malformed MIL with `InvalidMILProgram` errors | +| Type checking | Tensor dtypes, shapes, and parameter types must match exactly. Mismatches cause compile errors (e.g., `layer_norm` epsilon must match gamma/beta dtype; `concat` axis must be `int32` scalar, not tensor) | +| Op validation | Unknown or unsupported operations are rejected | +| I/O matching | MIL input/output names and shapes must match the `MLModelDescription` passed to `MLE5Engine` | + +### Runtime Safety + +| Mechanism | What it does | +|-----------|-------------| +| Shape enforcement | Input tensors must match declared shape exactly -- `MultiArray shape doesn't match ML Program's expected shape` error on mismatch | +| Daemon mediation | ANE runs through the `aned` daemon (system service). User processes only get 3 IOKit interfaces: open, close, `programSendRequest` | +| IOSurface isolation | I/O memory is managed by the kernel via IOSurface. Cannot read/write arbitrary memory through them | +| SRAM limits | Programs exceeding the ANE SRAM budget (~24-32MB on M4 Max) are rejected or fall back to CPU/GPU | +| Compile limit | ~119 compiled programs per process before the compiler leaks enough resources to fail (resource exhaustion, not a security boundary) | + +### Sandbox Interaction + +The E5 runtime needs write access to `~/Library/Caches/<bundle-id>/` for its ANE specialization cache. 
macOS app sandbox can block this, causing compilation to fail with permission errors. When running outside a sandbox (e.g., command-line tools), this directory is created automatically. + +### What is NOT Protected + +| Gap | Detail | +|-----|--------| +| No access control | No authentication or entitlement check for using the private APIs. Any process can call `_ANEClient.sharedConnection` | +| No rate limiting | Programs can be compiled in a loop until the ~119 limit exhausts resources | +| No MIL signing | No code signing validation on MIL text -- any syntactically valid program that passes the compiler's type checks will execute | +| No isolation between programs | Multiple programs from the same process share the ANE with no hardware-level isolation (the daemon schedules them) | + +### Practical Risk Assessment + +The ANE attack surface is limited because: + +1. **Fixed-function hardware**: The ANE executes predefined neural network operations, not arbitrary instructions. There is no instruction pointer, no stack, and no way to jump to arbitrary code. +2. **Typed dataflow**: MIL programs operate on typed tensors with fixed shapes. There are no buffer overflows in the traditional sense -- the compiler enforces all dimensions at compile time. +3. **Daemon intermediary**: All ANE access goes through `aned`, which validates requests before forwarding to the kernel driver. Direct IOKit access to the ANE is restricted to 3 interfaces. +4. **No persistent state**: ANE programs don't persist across reboots. Compiled programs live in temp directories and caches that are cleaned by the OS. + +The main risk of the private APIs is **stability**: these APIs are undocumented and may change with any macOS update, potentially breaking programs that depend on them. + +--- + +## 5. Is the ANE 16-bit? + +> hollance/neural-engine says: "It appears so." + +**Confirmed.** The ANE operates in FP16 for both compute and storage: + +- All IOSurface I/O must be FP16. 
Passing FP32 data produces zeros. +- MIL programs must use `fp16` I/O types (setting `g_fp16_io=1` in our codebase) +- F32-to-F16 conversion happens on the CPU before writing to IOSurfaces +- FP16 precision limits: values above ~65504 overflow, values below ~5.96e-8 underflow to zero + +### Quantization Support + +| Format | ANE Native? | Notes | +|--------|------------|-------| +| FP16 | Yes | Native compute and storage format | +| INT8 | Partial | Memory bandwidth savings only, no compute speedup. `constexpr_affine_dequantize` in MIL dequantizes to FP16 before compute | +| Q4 | No | Not supported. Requires GPU (Metal) or CPU dequantization | +| FP32 | No | Internally converted to FP16; higher precision lost | + +Apple quotes ANE throughput in INT8 TOPS, so the 38 TOPS figure for M4 corresponds to roughly 19 TFLOPS in FP16 (the headline number counts INT8 operations, and each FP16 operation is weighted as two of them). + +--- + +## 6. ANE vs GPU vs CPU + +Benchmarked on Qwen2.5-0.5B (dim=896, 24 layers, 494M params) on M4 Max: + +### Decode Performance (single-token generation) + +| Engine | Format | Weight Size | Decode t/s | Bottleneck | +|--------|--------|-------------|------------|------------| +| CPU AMX (cblas_sgemv) | F32 | 1.97 GB | ~91 t/s | Memory bandwidth | +| CPU AMX (cblas_sgemv) | F16->F32 | 658 MB disk | ~91 t/s | Memory bandwidth (F32 in RAM) | +| CPU AMX (cblas_sgemv) | Q4->F32 | 188 MB disk | ~91 t/s | Memory bandwidth (dequant at load) | +| Metal GPU (Q4 SIMD) | Q4 | 188 MB | ~10 t/s | Dispatch overhead (~400 dispatches/token) | +| LM Studio (MLX) | Q4 MLX | ~188 MB | 258-496 t/s | Optimized Metal kernels | + +### Prefill Performance (batch prompt processing) + +| Engine | Format | Prefill t/s | Method | +|--------|--------|-------------|--------| +| CPU AMX (cblas_sgemm) | F32 | 880-960 t/s | Batched matmul | +| CPU AMX (cblas_sgemv) | F32 | ~40 t/s | Sequential per-token | + +### ANE Training Kernel Performance + +| Metric | Value | +|--------|-------| +| Kernel latency | ~0.2 ms per kernel (768x256 
production dims) | +| Peak TFLOPS | 11.14 (128x conv 512ch sp64) | +| Sustained training | 1.29-1.68 TFLOPS | +| ANE utilization | 8-11% of peak | + +### When to use each + +- **ANE**: Best for parallel FP16 operations where data stays on-chip (training kernels, fused attention). The ~119 compile limit and FP16-only restriction are significant constraints. +- **GPU (Metal)**: Best for large models (dim >= 4096) where native quantized matmul kernels (as in MLX/llama.cpp) can read Q4/Q8 data directly from GPU memory. Dispatch overhead dominates for small models. +- **CPU AMX**: Best for small/medium model decode (dim <= 896). `cblas_sgemv` uses the AMX coprocessor internally and achieves ~33% of theoretical bandwidth. Cannot be beaten by manual NEON, threading, or Metal for this model size. + +--- + +## 7. Reverse engineering the ANE + +### Prior Work + +| Project | Focus | Key Contribution | +|---------|-------|-------------------| +| [hollance/neural-engine](https://github.com/hollance/neural-engine) | CoreML-level documentation | Comprehensive device list, layer compatibility, model surgery guides | +| [geohot/tinygrad ANE](https://github.com/tinygrad/tinygrad) | Driver-level reverse engineering | Initial IOKit driver analysis, ANE instruction format exploration | +| [Black Hat Asia 2021 (Wish Wu)](https://infocondb.org/con/black-hat/black-hat-asia-2021/apple-neural-engine-internal-from-ml-algorithm-to-hw-registers) | Full stack: ML to HW registers | Documented compilation pipeline, .hwx format, security attack surfaces, FaceID ANE usage. Created ANEDisassembler. 
[Video](https://www.youtube.com/watch?v=1wvBDUnPNEo) | +| [ANETools](https://github.com/antgroup-skyward/ANETools) | CLI compilation and disassembly | ANECompiler CLI wrapper, ANEDisassembler for .hwx files, `debug_mask` flag for intermediate output | +| [eiln/anecc](https://github.com/eiln/anecc) | Independent ANE compiler | CoreML-to-ANE compiler for Asahi Linux, alternative compilation path | +| [freedomtan/coreml_to_ane_hwx](https://github.com/freedomtan/coreml_to_ane_hwx) | CoreML to .hwx conversion | Direct converter bypassing some CoreML steps | +| [maderix/ANE](https://github.com/maderix/ANE) | Training on ANE | First neural network training on ANE via private APIs | +| [maderix Substack](https://open.substack.com/pub/maderix/p/inside-the-m4-apple-neural-engine) | M4 ANE deep-dive | Detailed M4 ANE architecture analysis, SRAM probing, kernel fusion | + +### Our Discoveries: Private API Class Hierarchy + +We have documented 20+ private Objective-C classes in `AppleNeuralEngine.framework`: + +``` +NSObject +|-- _ANEClient (singleton, daemon connection) +| Methods: sharedConnection, evaluateWithModel:, evaluateRealTimeWithModel:, +| doEvaluateDirectWithModel:, prepareChainingWithModel:, +| enqueueSetsWithModel:, buffersReadyWithModel:, +| beginRealTimeTask, endRealTimeTask +| +|-- _ANEInMemoryModelDescriptor (MIL + weights spec) +| Factory: +modelWithMILText:weights:optionsPlist: +| +|-- _ANEInMemoryModel (compile/load/run) +| Methods: compileWithQoS:, loadWithQoS:, evaluateWithQoS:, unloadWithQoS: +| Props: hexStringIdentifier, programHandle (uint64), program, perfStatsMask +| +|-- _ANEModel (disk-based compiled model -- 52 instance methods) +| Factory: +modelAtURL:key:, +modelAtURL:key:modelAttributes: +| Methods: getUUID, inputSymbolIndicesForProcedureIndex:, +| outputSymbolIndicesForProcedureIndex: +| Props: mapper, program +| +|-- _ANERequest (I/O surface packaging) +| Factory: +requestWithInputs:inputIndices:outputs:outputIndices: +| 
weightsBuffer:perfStats:procedureIndex: +| +|-- _ANEIOSurfaceObject (thin IOSurface wrapper) +| Factory: +objectWithIOSurface: +| +|-- _ANEBuffer (IOSurfaceObject + symbolIndex + source) [KEY DISCOVERY] +| Factory: +bufferWithIOSurfaceObject:symbolIndex:source: +| source: 0=ANE, 1=output, 2=unknown +| +|-- _ANEChainingRequest (multi-op pipeline) +| Factory: +chainingRequestWithInputs:outputSets:lbInputSymbolId: +| lbOutputSymbolId:procedureIndex:signalEvents: +| transactionHandle:fwEnqueueDelay:memoryPoolId: +| Methods: validate +| +|-- _ANEIOSurfaceOutputSets (output packaging for chaining) +| Factory: +objectWithstatsSurRef:outputBuffer: +| Note: requires non-NULL statsSurRef (any IOSurface works, even 64 bytes) +| +|-- _ANEInputBuffersReady (input signaling for chaining) +| Factory: +inputBuffersWithProcedureIndex:inputBufferInfoIndex: +| inputFreeValue:executionDelay: +| +|-- _ANEOutputSetEnqueue (output pipeline config for chaining) +| Factory: +outputSetWithProcedureIndex:setIndex:signalValue: +| signalNotRequired:isOpenLoop: +| +|-- _ANEProgramForEvaluation (lower-level program) +| Factory: +programWithHandle:intermediateBufferHandle:queueDepth: +| Methods: processRequest:model:qos:qIndex:modelStringID:options: +| returnValue:error: +| +|-- _ANEProgramIOSurfacesMapper (symbol-to-surface mapping) +| Factory: +mapperWithProgramHandle:, +mapperWithController: +| Note: only works with _ANEModel, not _ANEInMemoryModel +| +|-- _ANEPerformanceStats +| Factory: +statsWithHardwareExecutionNS: +| Props: hwExecutionTime, performanceCounters +| +|-- _ANESharedSignalEvent (hardware signal fence) +| Factory: +signalEventWithValue:symbolIndex:eventType:sharedEvent: +| Requires IOSurfaceSharedEvent objects +| +|-- _ANESharedWaitEvent (hardware wait fence) +| Factory: +waitEventWithValue:sharedEvent: +| Requires IOSurfaceSharedEvent objects +| +|-- _ANEModelInstanceParameters, _ANEDeviceController, _ANEQoSMapper +``` + +Full details with experiment logs: 
[ANE_CHAINING_RESEARCH.md](ANE_CHAINING_RESEARCH.md) + +### ChainingRequest API Status + +The `_ANEChainingRequest` API is designed to pipeline multiple ANE operations without CPU round-trips. Current status: + +- `_ANEChainingRequest.validate` returns **YES** (with `_ANEBuffer` inputs + `_ANEIOSurfaceOutputSets` outputs) +- `prepareChainingWithModel:` **fails** -- calls `getUUID` on `_ANEInMemoryModel` which lacks it +- Requires `_ANEModel` (disk-based compiled model) which has `getUUID` and symbol index methods +- `_ANEModel` factory methods require a `key:` parameter; the hex identifier from `_ANEInMemoryModel` is the likely key + +This is the highest-priority research area. Chaining would eliminate the ~23 CPU-ANE round-trips per token in a 12-layer model, potentially enabling on-chip pipeline execution. + +### model.hwx Binary Format + +The `.hwx` file is the compiled hardware representation loaded by the ANE kernel driver. From Wu's Black Hat research: + +- Mach-O format binary containing register operations +- Compiled from `net.plist` + weights by the ANECompiler module +- Loaded by the `H11ANEIn` kernel driver via `programCreate` interface +- ANE firmware parses it to extract register addresses and values +- Can be disassembled with [ANETools/ANEDisassembler](https://github.com/antgroup-skyward/ANETools) + +Our `_ANEInMemoryModel` path bypasses `.hwx` generation -- the model goes directly from MIL to an internal binary format in a temp directory. Whether this temp directory contains an equivalent to `.hwx` is an open question (see [ANE_CHAINING_RESEARCH.md](ANE_CHAINING_RESEARCH.md) for next steps). + +--- + +## 8. How to verify ANE execution + +### Power Monitoring + +```bash +sudo powermetrics --samplers ane_power -i 1000 +``` + +Shows real-time ANE power draw. Active ANE usage typically shows 2-4W on M4 Max during training. 
+ +### Performance Statistics + +```objc +model.perfStatsMask = 0xFF; +// After execution: +// model.performanceCounters -- returns nil on current macOS (limited API) +``` + +The `_ANEPerformanceStats` class exists and can be instantiated via `+statsWithHardwareExecutionNS:`, but the hardware counters are not populated on the current macOS/M4 combination. The `perfStatsMask` property is accepted but `performanceCounters` returns nil after execution. + +### IOSurface Output Validation + +Read back FP16 data from output IOSurfaces and compare against CPU reference: + +```objc +_Float16 *out = (_Float16 *)IOSurfaceGetBaseAddress(surface); +IOSurfaceLock(surface, kIOSurfaceLockReadOnly, NULL); +for (int i = 0; i < n; i++) { + float val = (float)out[i]; + // Compare against CPU reference +} +IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, NULL); +``` + +### ANE Compiler Debug Output + +From Wu's research, the ANECompiler module has a `debug_mask` flag. Setting it to `2147483647` (max int) generates intermediate files during compilation, revealing: +- Register operation sequences +- Memory allocation decisions +- Tiling strategies +- Weight layout in SRAM + +This can be applied when using the ANECompiler CLI tools from [ANETools](https://github.com/antgroup-skyward/ANETools). + +--- + +## 9. 
References and External Resources + +### Documentation and Research + +| Resource | URL | Focus | +|----------|-----|-------| +| hollance/neural-engine | https://github.com/hollance/neural-engine | CoreML-level ANE docs | +| maderix Substack | https://open.substack.com/pub/maderix/p/inside-the-m4-apple-neural-engine | M4 ANE architecture | +| Black Hat Asia 2021 | https://infocondb.org/con/black-hat/black-hat-asia-2021/apple-neural-engine-internal-from-ml-algorithm-to-hw-registers | Full stack reverse engineering | +| BH Asia 2021 Video | https://www.youtube.com/watch?v=1wvBDUnPNEo | 30-min talk by Wish Wu | +| Apple ML Research | https://machinelearning.apple.com/research/neural-engine-transformers | Deploying transformers on ANE | +| ANE Supported Devices | https://github.com/hollance/neural-engine/blob/master/docs/supported-devices.md | Comprehensive device/chip list | + +### Tools + +| Tool | URL | Purpose | +|------|-----|---------| +| ANETools | https://github.com/antgroup-skyward/ANETools | ANECompiler CLI, ANEDisassembler | +| eiln/anecc | https://github.com/eiln/anecc | Independent ANE compiler (Asahi Linux) | +| freedomtan/coreml_to_ane_hwx | https://github.com/freedomtan/coreml_to_ane_hwx | CoreML to .hwx converter | +| coremltools | https://github.com/apple/coremltools | Apple's official ML model tools | + +### Projects Using ANE Directly + +| Project | URL | What it does | +|---------|-----|-------------| +| maderix/ANE | https://github.com/maderix/ANE | Training on ANE (this project's upstream) | +| dev-erik/ANE | https://github.com/dev-erik/ANE | This fork: inference optimization, ChainingRequest research | + +### This Project's ANE Documentation + +| Document | Description | +|----------|-------------| +| [ANE_INTERNALS.md](ANE_INTERNALS.md) | This file -- comprehensive ANE internals guide | +| [ANE_CHAINING_RESEARCH.md](ANE_CHAINING_RESEARCH.md) | ChainingRequest API research, experiment logs, benchmarks | +| [ARCHITECTURE.md](ARCHITECTURE.md) | 
Training system architecture, kernel fusion map, data flow | +| [API_REFERENCE.md](API_REFERENCE.md) | Complete function index for all source files | +| [BENCHMARK_RESULTS.md](BENCHMARK_RESULTS.md) | M4 Max benchmark results (training, TFLOPS, SRAM) | diff --git a/training/Makefile b/training/Makefile index 7f16c1a..bfc72aa 100644 --- a/training/Makefile +++ b/training/Makefile @@ -1,14 +1,21 @@ CC = xcrun clang -CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc +CC_C = xcrun clang + +ANE_COMPAT = -Wno-deprecated-declarations +SEC_FLAGS = -fstack-protector-strong -Wformat-security + +CFLAGS = -O2 -Wall $(ANE_COMPAT) -fobjc-arc $(SEC_FLAGS) +CFLAGS_C = -O2 -Wall -Wextra -Werror -std=c11 +CFLAGS_DEBUG = -O0 -g -Wall $(ANE_COMPAT) -fobjc-arc -fsanitize=address,undefined FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface LDFLAGS = $(FRAMEWORKS) -ldl -HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h +HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h data_validation.h HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h train: train.m ane_runtime.h ane_mil_gen.h model.h forward.h backward.h - $(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ train.m $(LDFLAGS) -framework Accelerate train_large: train_large.m $(HEADERS_LARGE) $(CC) $(CFLAGS) -o $@ train_large.m $(LDFLAGS) -framework Accelerate @@ -16,6 +23,14 @@ train_large: train_large.m $(HEADERS_LARGE) train_large_ane: train_large_ane.m $(HEADERS_ANE) $(CC) $(CFLAGS) -o $@ train_large_ane.m $(LDFLAGS) -framework Accelerate +HEADERS_OPT = $(HEADERS_LARGE) stories_cpu_ops_opt.h + +train_opt: train_opt.m $(HEADERS_OPT) + $(CC) $(CFLAGS) -o $@ train_opt.m $(LDFLAGS) -framework Accelerate -framework Metal -framework MetalPerformanceShaders + +train_double_buffer: train_double_buffer.m $(HEADERS_LARGE) + $(CC) $(CFLAGS) -o $@ train_double_buffer.m $(LDFLAGS) -framework Accelerate + PROBES = test_weight_reload 
test_perf_stats test_qos_sweep test_ane_advanced test_rmsnorm_bwd: test_rmsnorm_bwd.m $(HEADERS_ANE) @@ -36,13 +51,56 @@ test_qos_sweep: test_qos_sweep.m test_ane_advanced: test_ane_advanced.m $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) +test_chaining: test_chaining.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_chaining_v2: test_chaining_v2.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_bench_paths: test_bench_paths.m ane_runtime.h + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_ane_model: test_ane_model.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Metal + +test_throughput_ceiling: test_throughput_ceiling.m ane_runtime.h + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_coreml_chaining: test_coreml_chaining.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Metal + +test_e5_validate: test_e5_validate.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Metal + +test_mil_custom: test_mil_custom.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -framework Accelerate + +test_data_validation: test_data_validation.c data_validation.h + $(CC_C) $(CFLAGS_C) -o $@ $< + probes: $(PROBES) +security-tests: test_data_validation + +data: tokenize + @bash download_data.sh + tokenize: python3 tokenize.py -clean: - rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier +setup: data + @echo "=== Setup complete ===" + @echo "Data: tinystories_data00.bin" + @echo "To train: make train_large && ./train_large" + @echo "Override paths: ANE_MODEL_PATH=... ANE_DATA_PATH=... 
./train_large" + +verify-flags: + @echo "=== Active CFLAGS ===" + @echo "$(CFLAGS)" + @echo "=== Compiler version ===" + @xcrun clang --version -.PHONY: clean tokenize probes +clean: + rm -f train train_large train_large_ane train_opt train_double_buffer $(PROBES) test_rmsnorm_bwd test_classifier test_data_validation test_chaining test_chaining_v2 test_bench_paths test_ane_model test_throughput_ceiling test_coreml_chaining test_e5_validate test_mil_custom +.PHONY: clean tokenize probes security-tests verify-flags data setup diff --git a/training/ane_runtime.h b/training/ane_runtime.h index 58bcb79..8a70b99 100644 --- a/training/ane_runtime.h +++ b/training/ane_runtime.h @@ -20,15 +20,33 @@ typedef struct { static Class g_ANEDesc, g_ANEInMem, g_ANEReq, g_ANEIO; static bool g_ane_loaded = false; +static id g_ane_client = nil; +static bool g_ane_ok = false; static void ane_init(void) { if (g_ane_loaded) return; - dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW); + g_ane_loaded = true; // Set first to prevent re-entry (ref: CRIT-01) + void *handle = dlopen( + "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", + RTLD_NOW); + if (!handle) { + fprintf(stderr, "ANE: dlopen failed: %s\n", dlerror()); + return; + } g_ANEDesc = NSClassFromString(@"_ANEInMemoryModelDescriptor"); g_ANEInMem = NSClassFromString(@"_ANEInMemoryModel"); g_ANEReq = NSClassFromString(@"_ANERequest"); g_ANEIO = NSClassFromString(@"_ANEIOSurfaceObject"); - g_ane_loaded = true; + if (!g_ANEDesc || !g_ANEInMem || !g_ANEReq || !g_ANEIO) { + fprintf(stderr, "ANE: Private classes not found (macOS version mismatch?)\n"); + return; + } + g_ane_ok = true; + + Class clientCls = NSClassFromString(@"_ANEClient"); + if (clientCls) { + g_ane_client = [clientCls performSelector:@selector(sharedConnection)]; + } } static IOSurfaceRef ane_create_surface(size_t bytes) { @@ -50,6 +68,7 @@ static ANEKernel *ane_compile(NSData *milText, 
NSData *weightData, int nInputs, size_t *inputSizes, int nOutputs, size_t *outputSizes) { ane_init(); + if (!g_ane_ok) { fprintf(stderr, "ANE: not available\n"); return NULL; } // CRIT-01/02 NSError *e = nil; NSDictionary *wdict = nil; @@ -63,6 +82,7 @@ static ANEKernel *ane_compile(NSData *milText, NSData *weightData, id mdl = ((id(*)(Class,SEL,id))objc_msgSend)( g_ANEInMem, @selector(inMemoryModelWithDescriptor:), desc); + if (!mdl) { fprintf(stderr, "ANE: inMemoryModel allocation failed\n"); return NULL; } // CRIT-02 // Pre-populate temp dir with MIL + weights id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); @@ -151,6 +171,20 @@ static bool ane_eval(ANEKernel *k) { return ok; } +static bool ane_eval_rt(ANEKernel *k) { + if (!g_ane_client) return ane_eval(k); + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + g_ane_client, @selector(evaluateRealTimeWithModel:options:request:error:), + k->model, @{}, k->request, &e); + if (!ok) { + fprintf(stderr, "ANE RT eval failed, falling back to standard: %s\n", + e ? 
[[e description] UTF8String] : "unknown"); + return ane_eval(k); + } + return true; +} + static void ane_free(ANEKernel *k) { if (!k) return; NSError *e = nil; diff --git a/training/test_ane_model.m b/training/test_ane_model.m new file mode 100644 index 0000000..e18e94c --- /dev/null +++ b/training/test_ane_model.m @@ -0,0 +1,2260 @@ +// test_ane_model.m — Experiments E-H: _ANEModel loading, ANECompiler, chaining, shared events +// Build: make test_ane_model && ./test_ane_model +#import +#import +#import +#import +#import +#import +#include + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +__attribute__((unused)) static int g_fp16_io = 1; + +#pragma mark - Helpers + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods (%u):\n", count); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods (%u):\n", count); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties (%u):\n", pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? 
pattr : "?");
    }
    free(props);
}

// Prints every declared property of `cls`, read from `obj` through KVC.
// A property whose getter throws is reported inline instead of aborting the
// dump, so one bad accessor does not hide the remaining properties.
static void dump_all_properties(id obj, Class cls) {
    if (!obj) return;
    unsigned int pcount = 0;
    objc_property_t *props = class_copyPropertyList(cls, &pcount);
    for (unsigned int i = 0; i < pcount; i++) {
        const char *pname = property_getName(props[i]);
        @try {
            id val = [obj valueForKey:[NSString stringWithUTF8String:pname]];
            printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil");
        } @catch (NSException *ex) {
            // The format previously had a single %s for two arguments, so the
            // exception reason was silently dropped.
            const char *why = [[ex reason] UTF8String];
            printf(" %s = <exception: %s>\n", pname, why ? why : "unknown");
        }
    }
    free(props);  // free(NULL) is a no-op when the class declares no properties
}

// Recursively prints the contents of `path`, indenting one step per `depth`.
// Directories are tagged [DIR]; regular entries are printed with their size.
static void list_dir_recursive(NSString *path, int depth) {
    NSFileManager *fm = [NSFileManager defaultManager];
    NSArray *entries = [fm contentsOfDirectoryAtPath:path error:nil];
    for (NSString *name in entries) {
        NSString *child = [path stringByAppendingPathComponent:name];
        BOOL isDir = NO;
        [fm fileExistsAtPath:child isDirectory:&isDir];

        for (int level = 0; level < depth; level++) printf(" ");
        if (isDir) {
            printf(" [DIR] %s/\n", [name UTF8String]);
            list_dir_recursive(child, depth + 1);
            continue;
        }
        // Size is only needed for non-directory entries.
        unsigned long long sz = [[fm attributesOfItemAtPath:child error:nil] fileSize];
        printf(" %s (%llu bytes)\n", [name UTF8String], sz);
    }
}

// Creates a one-row IOSurface of `bytes` bytes (1 byte per element, pixel
// format 0). The caller owns the returned surface and must CFRelease it.
// Returns whatever IOSurfaceCreate returns, which may be NULL on failure.
static IOSurfaceRef make_surface(size_t bytes) {
    NSDictionary *props = @{
        (id)kIOSurfaceWidth:           @(bytes),
        (id)kIOSurfaceHeight:          @1,
        (id)kIOSurfaceBytesPerElement: @1,
        (id)kIOSurfaceBytesPerRow:     @(bytes),
        (id)kIOSurfaceAllocSize:       @(bytes),
        (id)kIOSurfacePixelFormat:     @0,
    };
    return IOSurfaceCreate((__bridge CFDictionaryRef)props);
}

#pragma mark - MIL Generation (FP16 conv)

// Builds the MIL text for a single FP16 convolution with `ch` input/output
// channels over a spatial width of `sp`, reading its weights from
// "@model_path/weights/weight.bin" at offset 64.
//
// The eight trailing integer arguments are consumed in order by:
//   x : [1, ch, 1, sp]   W type : [ch, ch, 1, 1]   W val : [ch, ch, 1, 1]
//   y : [1, ch, 1, sp]
//
// NOTE(review): the tensor type annotations and the `<ios16>` opset tag were
// missing from the text this was restored from (the format string had no
// conversion specifiers for its eight arguments). Shapes follow from the
// argument order and the buffer sizes used by compile_kernel; the scalar
// dtypes and the opset name are assumptions - confirm against a known-good
// MIL program before relying on them.
static NSString *gen_conv_mil(int ch, int sp) {
    return [NSString stringWithFormat:
        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>"
        "({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
        " func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
        " tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"),"
        " val=tensor<string, []>(\"valid\")];\n"
        " tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"),"
        " val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"),"
        " val=tensor<int32, [4]>([0,0,0,0])];\n"
        " tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"),"
        " val=tensor<int32, [2]>([1,1])];\n"
        " tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"),"
        " val=tensor<int32, []>(1)];\n"
        " tensor<fp16, [%d, %d, 1, 1]> W = const()[name=tensor<string, []>(\"W\"), "
        "val=tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path=tensor<string, []>"
        "(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
        " tensor<fp16, [1, %d, 1, %d]> y = conv(dilations=dl,groups=gr,"
        "pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
        "[name=tensor<string, []>(\"conv\")];\n"
        " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp];
}

#pragma mark - Kernel Compilation

// One compiled-and-loaded conv kernel plus the I/O surfaces sized for it.
// `model` stays nil when compilation or loading failed; callers test it
// before using any other field.
typedef struct {
    id model;                  // loaded _ANEInMemoryModel, or nil on failure
    IOSurfaceRef ioIn, ioOut;  // owned; released by free_kernel
    NSString *tmpDir;          // directory holding model.mil and weights/weight.bin
    NSString *hexId;           // the model's hexStringIdentifier
    int ch, sp;                // channel count and spatial width
} CompiledKernel;

static void free_kernel(CompiledKernel *k);

// Compiles and loads a `ch` x `sp` FP16 identity convolution.
//
// Builds a weight blob (128-byte header + ch*ch fp16 values with ones on the
// diagonal), wraps it with the generated MIL in an in-memory model, mirrors
// both files into a temp directory named after the model's hex identifier,
// then compiles and loads the model and allocates input/output surfaces of
// ch*sp*2 bytes each.
//
// Returns a CompiledKernel whose `model` is nil if any step failed; the
// reason is printed to stdout.
static CompiledKernel compile_kernel(int ch, int sp) {
    CompiledKernel k = {0};
    k.ch = ch; k.sp = sp;

    if (ch <= 0 || sp <= 0) {
        printf(" ERROR: invalid kernel dims %dx%d\n", ch, sp);
        return k;
    }

    Class gD = NSClassFromString(@"_ANEInMemoryModelDescriptor");
    Class gI = NSClassFromString(@"_ANEInMemoryModel");
    if (!gD || !gI) { printf(" ERROR: ANE classes not found\n"); return k; }

    // Sizes are computed in size_t so ch*ch*2 cannot overflow int; the header
    // stores the payload size in 32 bits, so anything larger is rejected.
    size_t ws = (size_t)ch * (size_t)ch * sizeof(_Float16);
    if (ws > UINT32_MAX) {
        printf(" ERROR: weight payload too large for blob header\n");
        return k;
    }
    size_t tot = 128 + ws;
    uint8_t *blob = (uint8_t *)calloc(tot, 1);
    if (!blob) { printf(" ERROR: weight blob allocation failed\n"); return k; }

    // Blob header fields, written exactly as the original layout.
    blob[0] = 1; blob[4] = 2;
    blob[64] = 0xEF; blob[65] = 0xBE; blob[66] = 0xAD; blob[67] = 0xDE;
    blob[68] = 1;
    *(uint32_t *)(blob + 72) = (uint32_t)ws;   // payload size in bytes
    *(uint32_t *)(blob + 80) = 128;            // payload offset
    _Float16 *wp = (_Float16 *)(blob + 128);
    for (int i = 0; i < ch; i++) wp[(size_t)i * ch + i] = (_Float16)1.0f;
    // NSData takes ownership of `blob` and frees it.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob length:(NSUInteger)tot
                                   freeWhenDone:YES];

    NSString *mil = gen_conv_mil(ch, sp);
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];

    id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(gD,
        @selector(modelWithMILText:weights:optionsPlist:),
        md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil);
    if (!desc) { printf(" ERROR: model descriptor creation failed\n"); return k; }

    id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(gI,
        @selector(inMemoryModelWithDescriptor:), desc);
    if (!mdl) { printf(" ERROR: inMemoryModel creation failed\n"); return k; }

    id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
    if (!hx) {
        // stringByAppendingPathComponent: raises on a nil argument.
        printf(" ERROR: model has no hexStringIdentifier\n");
        return k;
    }
    k.hexId = hx;

    NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
    NSFileManager *fm = [NSFileManager defaultManager];
    NSError *fsErr = nil;
    if (![fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
       withIntermediateDirectories:YES attributes:nil error:&fsErr]) {
        printf(" WARNING: could not create %s: %s\n", [td UTF8String],
               fsErr ? [[fsErr description] UTF8String] : "unknown");
    }
    if (![md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES])
        printf(" WARNING: could not write model.mil\n");
    if (![wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"]
                 atomically:YES])
        printf(" WARNING: could not write weights/weight.bin\n");

    NSError *e = nil;
    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
    if (!ok) {
        printf(" Compile failed: %s\n", e ? [[e description] UTF8String] : "unknown");
        return k;
    }

    e = nil;  // do not report a stale compile-time error for a load failure
    ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(
        mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
    if (!ok) {
        printf(" Load failed: %s\n", e ? [[e description] UTF8String] : "unknown");
        return k;
    }

    k.model = mdl;
    k.tmpDir = td;
    k.ioIn = make_surface((size_t)ch * sp * 2);
    k.ioOut = make_surface((size_t)ch * sp * 2);
    if (!k.ioIn || !k.ioOut) {
        // Without both surfaces the kernel is unusable; unload and report
        // failure through a nil model like every other error path.
        printf(" ERROR: IOSurface allocation failed\n");
        free_kernel(&k);
    }
    return k;
}

// Unloads the kernel's model and releases its surfaces. Safe to call on a
// kernel that never loaded (nil model) and safe to call more than once:
// released fields are cleared so a second call is a no-op.
static void free_kernel(CompiledKernel *k) {
    if (!k || !k->model) return;
    NSError *e = nil;
    BOOL ok = ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(
        k->model, @selector(unloadWithQoS:error:), 21, &e);
    if (!ok)
        printf(" Unload failed: %s\n", e ? [[e description] UTF8String] : "unknown");
    if (k->ioIn) { CFRelease(k->ioIn); k->ioIn = NULL; }
    if (k->ioOut) { CFRelease(k->ioOut); k->ioOut = NULL; }
    k->model = nil;
}

#pragma mark - Main

int main(int argc, const char *argv[]) {
    (void)argc; (void)argv;
    @autoreleasepool {
        mach_timebase_info(&g_tb);
        printf("==============================================================\n");
        printf(" ANE Experiments E-H: _ANEModel, Compiler, Chaining\n");
        printf("==============================================================\n\n");

        void *handle = dlopen(
            "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/"
            "AppleNeuralEngine", RTLD_NOW);
        if (!handle) { printf("FATAL: dlopen ANE
framework failed\n"); return 1; } + + Class gAIO = NSClassFromString(@"_ANEIOSurfaceObject"); + Class gAR = NSClassFromString(@"_ANERequest"); + Class gBuf = NSClassFromString(@"_ANEBuffer"); + Class gOutSets = NSClassFromString(@"_ANEIOSurfaceOutputSets"); + Class gChain = NSClassFromString(@"_ANEChainingRequest"); + id client = [NSClassFromString(@"_ANEClient") + performSelector:@selector(sharedConnection)]; + + printf("=== Compiling test kernels (64x32 FP16 conv) ===\n"); + CompiledKernel k1 = compile_kernel(64, 32); + CompiledKernel k2 = compile_kernel(64, 32); + if (!k1.model || !k2.model) { + printf("FATAL: Kernel compilation failed\n"); + return 1; + } + printf(" k1: hexId=%s\n tmpDir=%s\n", + [k1.hexId UTF8String], [k1.tmpDir UTF8String]); + printf(" k2: hexId=%s\n tmpDir=%s\n", + [k2.hexId UTF8String], [k2.tmpDir UTF8String]); + + // ================================================================= + // EXPERIMENT E: Load _ANEModel from compiled temp directory + // ================================================================= + printf("\n------------------------------------------------------------\n"); + printf(" EXPERIMENT E: Load _ANEModel from compiled temp dir\n"); + printf("------------------------------------------------------------\n"); + + id diskModel1 = nil; + id diskModel2 = nil; + Class gANEModel = NSClassFromString(@"_ANEModel"); + + if (!gANEModel) { + printf(" FATAL: _ANEModel class not found\n"); + } else { + printf("\n --- E.1: Full _ANEModel API surface ---\n"); + dump_class("_ANEModel"); + + printf("\n --- E.2: Temp dir contents (k1) ---\n"); + printf(" Path: %s\n", [k1.tmpDir UTF8String]); + list_dir_recursive(k1.tmpDir, 0); + + printf("\n --- E.3: Factory method probing ---\n"); + + unsigned int mcount; + Method *cmethods = class_copyMethodList( + object_getClass(gANEModel), &mcount); + printf(" All _ANEModel class methods (%u):\n", mcount); + for (unsigned int i = 0; i < mcount; i++) { + printf(" + %s\n", 
sel_getName(method_getName(cmethods[i]))); + } + free(cmethods); + + NSURL *dirURL1 = [NSURL fileURLWithPath:k1.tmpDir]; + NSString *hexKey1 = k1.hexId; + printf("\n URL: %s\n", [[dirURL1 absoluteString] UTF8String]); + printf(" key (hexId): %s\n", [hexKey1 UTF8String]); + + // E.3a: modelAtURL:key: + printf("\n --- E.3a: modelAtURL:key: ---\n"); + @try { + SEL sel = NSSelectorFromString(@"modelAtURL:key:"); + if ([gANEModel respondsToSelector:sel]) { + diskModel1 = ((id(*)(Class,SEL,id,id))objc_msgSend)( + gANEModel, sel, dirURL1, hexKey1); + printf(" Result: %s\n", + diskModel1 ? [[diskModel1 description] UTF8String] : "nil"); + } else { + printf(" NOT available\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // E.3b: modelAtURL:key:modelAttributes: + if (!diskModel1) { + printf("\n --- E.3b: modelAtURL:key:modelAttributes: ---\n"); + @try { + SEL sel = NSSelectorFromString( + @"modelAtURL:key:modelAttributes:"); + if ([gANEModel respondsToSelector:sel]) { + diskModel1 = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + gANEModel, sel, dirURL1, hexKey1, @{}); + printf(" empty attrs: %s\n", + diskModel1 ? [[diskModel1 description] UTF8String] + : "nil"); + if (!diskModel1) { + diskModel1 = ((id(*)(Class,SEL,id,id,id))objc_msgSend)( + gANEModel, sel, dirURL1, hexKey1, nil); + printf(" nil attrs: %s\n", + diskModel1 ? [[diskModel1 description] UTF8String] + : "nil"); + } + } else { + printf(" NOT available\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // E.3c: modelWithCacheURLIdentifier: + if (!diskModel1) { + printf("\n --- E.3c: modelWithCacheURLIdentifier: ---\n"); + @try { + SEL sel = NSSelectorFromString(@"modelWithCacheURLIdentifier:"); + if ([gANEModel respondsToSelector:sel]) { + diskModel1 = ((id(*)(Class,SEL,id))objc_msgSend)( + gANEModel, sel, hexKey1); + printf(" hexId: %s\n", + diskModel1 ? 
[[diskModel1 description] UTF8String] + : "nil"); + if (!diskModel1) { + diskModel1 = ((id(*)(Class,SEL,id))objc_msgSend)( + gANEModel, sel, k1.tmpDir); + printf(" tmpDir: %s\n", + diskModel1 ? [[diskModel1 description] UTF8String] + : "nil"); + } + } else { + printf(" NOT available\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // E.3d: modelAtURLWithSourceURL:sourceURL:key:cacheURLIdentifier: + if (!diskModel1) { + printf("\n --- E.3d: modelAtURLWithSourceURL:... ---\n"); + @try { + SEL sel = NSSelectorFromString( + @"modelAtURLWithSourceURL:sourceURL:key:cacheURLIdentifier:"); + if ([gANEModel respondsToSelector:sel]) { + diskModel1 = ((id(*)(Class,SEL,id,id,id,id))objc_msgSend)( + gANEModel, sel, dirURL1, dirURL1, hexKey1, hexKey1); + printf(" Result: %s\n", + diskModel1 ? [[diskModel1 description] UTF8String] + : "nil"); + } else { + printf(" NOT available\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // E.3e: alloc/init variants + if (!diskModel1) { + printf("\n --- E.3e: alloc/init variants ---\n"); + unsigned int imcount; + Method *imethods = class_copyMethodList(gANEModel, &imcount); + printf(" Init-like instance methods:\n"); + for (unsigned int i = 0; i < imcount; i++) { + const char *mname = sel_getName(method_getName(imethods[i])); + if (strstr(mname, "init") || strstr(mname, "Init")) { + printf(" - %s [%s]\n", mname, + method_getTypeEncoding(imethods[i])); + } + } + free(imethods); + + @try { + SEL initSel = NSSelectorFromString(@"initWithURL:key:"); + if ([gANEModel instancesRespondToSelector:initSel]) { + id obj = [gANEModel alloc]; + diskModel1 = ((id(*)(id,SEL,id,id))objc_msgSend)( + obj, initSel, dirURL1, hexKey1); + printf(" initWithURL:key: %s\n", + diskModel1 ? 
[[diskModel1 description] UTF8String] + : "nil"); + } + } @catch (NSException *ex) { + printf(" initWithURL:key: EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + // E.3f: Search for .hwx files + if (!diskModel1) { + printf("\n --- E.3f: Search for .hwx / .plist files ---\n"); + NSFileManager *fm = [NSFileManager defaultManager]; + NSDirectoryEnumerator *dirEnum = [fm enumeratorAtPath:k1.tmpDir]; + NSString *file; + while ((file = [dirEnum nextObject])) { + NSString *ext = [file pathExtension]; + if ([ext isEqualToString:@"hwx"] || + [ext isEqualToString:@"plist"] || + [ext isEqualToString:@"espresso"]) { + NSString *fp = [k1.tmpDir + stringByAppendingPathComponent:file]; + NSDictionary *attrs = [fm attributesOfItemAtPath:fp + error:nil]; + printf(" Found: %s (%llu bytes)\n", + [file UTF8String], [attrs fileSize]); + } + } + + NSString *netPlist = [k1.tmpDir + stringByAppendingPathComponent:@"net.plist"]; + if ([fm fileExistsAtPath:netPlist]) { + printf(" net.plist found! Reading...\n"); + @try { + NSDictionary *plist = [NSDictionary + dictionaryWithContentsOfFile:netPlist]; + if (plist) { + printf(" net.plist keys: %s\n", + [[[plist allKeys] description] UTF8String]); + } else { + NSData *raw = [NSData dataWithContentsOfFile:netPlist]; + printf(" net.plist: binary (%lu bytes)\n", + (unsigned long)raw.length); + } + } @catch (NSException *ex) { + printf(" net.plist EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + + // E.3g: Try constructing from programHandle + if (!diskModel1) { + printf("\n --- E.3g: programHandle-based construction ---\n"); + @try { + id progHandle = [k1.model valueForKey:@"programHandle"]; + printf(" k1 programHandle = %s\n", + progHandle ? 
[[progHandle description] UTF8String] + : "nil"); + unsigned int mct; + Method *cls_m = class_copyMethodList( + object_getClass(gANEModel), &mct); + for (unsigned int i = 0; i < mct; i++) { + const char *mn = sel_getName(method_getName(cls_m[i])); + if (strstr(mn, "Handle") || strstr(mn, "handle") || + strstr(mn, "program") || strstr(mn, "Program")) { + printf(" Relevant factory: +%s\n", mn); + } + } + free(cls_m); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // E.4: If loaded, query critical methods + if (diskModel1) { + printf("\n ======================================================\n"); + printf(" _ANEModel LOADED SUCCESSFULLY!\n"); + printf(" ======================================================\n"); + + printf("\n --- E.4a: All properties ---\n"); + dump_all_properties(diskModel1, gANEModel); + + printf("\n --- E.4b: getUUID ---\n"); + @try { + SEL uuidSel = NSSelectorFromString(@"getUUID"); + if ([diskModel1 respondsToSelector:uuidSel]) { + id uuid = ((id(*)(id,SEL))objc_msgSend)( + diskModel1, uuidSel); + printf(" getUUID: %s\n", + uuid ? [[uuid description] UTF8String] : "nil"); + } else { + printf(" getUUID: NOT available\n"); + } + } @catch (NSException *ex) { + printf(" getUUID EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- E.4c: Symbol indices ---\n"); + @try { + SEL inSel = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + if ([diskModel1 respondsToSelector:inSel]) { + id idx = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + diskModel1, inSel, (NSUInteger)0); + printf(" inputSymbolIndices(0): %s\n", + idx ? [[idx description] UTF8String] : "nil"); + } + SEL outSel = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + if ([diskModel1 respondsToSelector:outSel]) { + id idx = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + diskModel1, outSel, (NSUInteger)0); + printf(" outputSymbolIndices(0): %s\n", + idx ? 
[[idx description] UTF8String] : "nil"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- E.4d: mapper ---\n"); + @try { + id mapper = [diskModel1 valueForKey:@"mapper"]; + printf(" mapper: %s\n", + mapper ? [[mapper description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- E.4e: program ---\n"); + @try { + id prog = [diskModel1 valueForKey:@"program"]; + printf(" program: %s\n", + prog ? [[prog description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // E.4f: Copy programHandle + program from InMemoryModel + printf("\n --- E.4f: Populate _ANEModel from InMemoryModel ---\n"); + @try { + id imProgHandle = [k1.model valueForKey:@"programHandle"]; + id imProgram = [k1.model valueForKey:@"program"]; + id imMapper = nil; + @try { imMapper = [k1.model valueForKey:@"mapper"]; } + @catch (NSException *ex) { (void)ex; } + + printf(" InMemoryModel programHandle: %s\n", + imProgHandle ? [[imProgHandle description] UTF8String] + : "nil"); + printf(" InMemoryModel program: %s\n", + imProgram ? [[imProgram description] UTF8String] + : "nil"); + + if (imProgHandle) { + uint64_t ph = [imProgHandle unsignedLongLongValue]; + ((void(*)(id,SEL,uint64_t))objc_msgSend)( + diskModel1, + @selector(setProgramHandle:), ph); + printf(" Set programHandle on _ANEModel: %llu\n", ph); + } + if (imProgram) { + ((void(*)(id,SEL,id))objc_msgSend)( + diskModel1, @selector(setProgram:), imProgram); + printf(" Set program on _ANEModel\n"); + } + + // Verify + id newPH = [diskModel1 valueForKey:@"programHandle"]; + id newProg = [diskModel1 valueForKey:@"program"]; + printf(" _ANEModel programHandle now: %s\n", + newPH ? [[newPH description] UTF8String] : "nil"); + printf(" _ANEModel program now: %s\n", + newProg ? 
[[newProg description] UTF8String] : "nil"); + + // Re-check symbol indices after populating + printf("\n Re-checking symbol indices...\n"); + SEL inSel2 = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + if ([diskModel1 respondsToSelector:inSel2]) { + id idx = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + diskModel1, inSel2, (unsigned int)0); + printf(" inputSymbolIndices(0): %s\n", + idx ? [[idx description] UTF8String] : "nil"); + } + SEL outSel2 = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + if ([diskModel1 respondsToSelector:outSel2]) { + id idx = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + diskModel1, outSel2, (unsigned int)0); + printf(" outputSymbolIndices(0): %s\n", + idx ? [[idx description] UTF8String] : "nil"); + } + + // Try getUUID again + id uuid2 = ((id(*)(id,SEL))objc_msgSend)( + diskModel1, NSSelectorFromString(@"getUUID")); + printf(" getUUID after populate: %s\n", + uuid2 ? [[uuid2 description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" Populate EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + // Also load k2 and populate it + printf("\n --- E.5: Loading k2 as _ANEModel ---\n"); + NSURL *dirURL2 = [NSURL fileURLWithPath:k2.tmpDir]; + @try { + SEL sel = NSSelectorFromString(@"modelAtURL:key:"); + if ([gANEModel respondsToSelector:sel]) { + diskModel2 = ((id(*)(Class,SEL,id,id))objc_msgSend)( + gANEModel, sel, dirURL2, k2.hexId); + printf(" k2 _ANEModel: %s\n", + diskModel2 ? 
"LOADED" : "nil"); + if (diskModel2) { + id k2ph = [k2.model valueForKey:@"programHandle"]; + id k2prog = [k2.model valueForKey:@"program"]; + if (k2ph) { + ((void(*)(id,SEL,uint64_t))objc_msgSend)( + diskModel2, @selector(setProgramHandle:), + [k2ph unsignedLongLongValue]); + } + if (k2prog) { + ((void(*)(id,SEL,id))objc_msgSend)( + diskModel2, @selector(setProgram:), + k2prog); + } + printf(" k2 populated with programHandle + program\n"); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf("\n _ANEModel could NOT be loaded via any factory.\n"); + printf(" Proceeding to Experiment E2 (ANECompiler).\n"); + } + } + + // ================================================================= + // EXPERIMENT E2: ANECompiler probing + // ================================================================= + printf("\n------------------------------------------------------------\n"); + printf(" EXPERIMENT E2: ANECompiler / model.hwx generation\n"); + printf("------------------------------------------------------------\n"); + { + printf("\n --- E2.1: Looking for ANECompiler ---\n"); + const char *compiler_paths[] = { + "/System/Library/PrivateFrameworks/" + "ANECompiler.framework/ANECompiler", + "/System/Library/PrivateFrameworks/" + "ANECompiler.framework/Versions/Current/ANECompiler", + "/System/Library/Frameworks/" + "CoreML.framework/Versions/A/CoreML", + NULL + }; + void *compilerHandle = NULL; + for (int i = 0; compiler_paths[i]; i++) { + compilerHandle = dlopen(compiler_paths[i], RTLD_NOW); + if (compilerHandle) { + printf(" Found: %s\n", compiler_paths[i]); + break; + } else { + printf(" Not at: %s\n", compiler_paths[i]); + } + } + + printf("\n --- E2.2: Compiler class search ---\n"); + const char *compiler_classes[] = { + "ANECompiler", "_ANECompiler", "ANECompilerService", + "_ANECompilerService", "ANECompileOptions", + "_ANECompileOptions", "ANEModelCompiler", + "_ANEModelCompiler", "ANECCompiler", NULL 
+ }; + for (int i = 0; compiler_classes[i]; i++) { + Class cls = NSClassFromString( + [NSString stringWithUTF8String:compiler_classes[i]]); + if (cls) { + printf(" FOUND: %s\n", compiler_classes[i]); + dump_class(compiler_classes[i]); + } else { + printf(" %s: not found\n", compiler_classes[i]); + } + } + + printf("\n --- E2.3: Compile with debug_mask ---\n"); + @try { + Class gD = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class gI = NSClassFromString(@"_ANEInMemoryModel"); + int dbg_ch = 32, dbg_ws = dbg_ch * dbg_ch * 2; + int dbg_tot = 128 + dbg_ws; + uint8_t *dbg_blob = (uint8_t *)calloc((size_t)dbg_tot, 1); + dbg_blob[0] = 1; dbg_blob[4] = 2; + dbg_blob[64] = 0xEF; dbg_blob[65] = 0xBE; + dbg_blob[66] = 0xAD; dbg_blob[67] = 0xDE; + dbg_blob[68] = 1; + *(uint32_t *)(dbg_blob + 72) = (uint32_t)dbg_ws; + *(uint32_t *)(dbg_blob + 80) = 128; + _Float16 *dbg_wp = (_Float16 *)(dbg_blob + 128); + for (int i = 0; i < dbg_ch; i++) + dbg_wp[i * dbg_ch + i] = (_Float16)1.0f; + NSData *wdata = [NSData dataWithBytesNoCopy:dbg_blob + length:(NSUInteger)dbg_tot freeWhenDone:YES]; + + NSString *mil = gen_conv_mil(32, 16); + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(gD, + @selector(modelWithMILText:weights:optionsPlist:), + md, + @{@"@model_path/weights/weight.bin": + @{@"offset":@0, @"data":wdata}}, + nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(gI, + @selector(inMemoryModelWithDescriptor:), desc); + + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, + @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() + stringByAppendingPathComponent: + [NSString stringWithFormat:@"debug_%@", hx]]; + NSFileManager *fm = [NSFileManager defaultManager]; + [fm createDirectoryAtPath: + [td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] + atomically:YES]; + [wdata 
writeToFile: + [td stringByAppendingPathComponent:@"weights/weight.bin"] + atomically:YES]; + + NSDictionary *debugOpts = @{ + @"debug_mask": @(INT_MAX), + @"ANEDebugMask": @(INT_MAX), + @"ane_debug_mask": @(INT_MAX), + }; + printf(" Compiling with debug_mask=%d...\n", INT_MAX); + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**)) + objc_msgSend)(mdl, + @selector(compileWithQoS:options:error:), + 21, debugOpts, &e); + printf(" Compile: %s\n", ok ? "SUCCESS" : "FAILED"); + if (!ok && e) + printf(" Error: %s\n", [[e description] UTF8String]); + + if (ok) { + printf(" Temp dir after debug compile:\n"); + list_dir_recursive(td, 0); + + NSDirectoryEnumerator *de = [fm enumeratorAtPath:td]; + NSString *f; + int hwxCount = 0; + while ((f = [de nextObject])) { + if ([[f pathExtension] isEqualToString:@"hwx"]) + hwxCount++; + } + if (hwxCount > 0) + printf(" Found %d .hwx file(s)!\n", hwxCount); + + if (gANEModel && !diskModel1) { + @try { + SEL sel = NSSelectorFromString(@"modelAtURL:key:"); + if ([gANEModel respondsToSelector:sel]) { + NSURL *dURL = [NSURL fileURLWithPath:td]; + diskModel1 = ((id(*)(Class,SEL,id,id)) + objc_msgSend)(gANEModel, sel, dURL, hx); + printf(" modelAtURL:key: on debug dir: %s\n", + diskModel1 + ? 
[[diskModel1 description] UTF8String] + : "nil"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + [fm removeItemAtPath:td error:nil]; + } @catch (NSException *ex) { + printf(" E2.3 EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- E2.4: ane_compiler_service search ---\n"); + { + NSFileManager *fm = [NSFileManager defaultManager]; + const char *svc_paths[] = { + "/usr/libexec/ane_compiler_service", + "/System/Library/CoreServices/ane_compiler_service", + "/System/Library/PrivateFrameworks/" + "ANECompiler.framework/ane_compiler_service", + NULL + }; + for (int i = 0; svc_paths[i]; i++) { + NSString *p = [NSString stringWithUTF8String:svc_paths[i]]; + printf(" %s: %s\n", svc_paths[i], + [fm fileExistsAtPath:p] ? "FOUND" : "not found"); + } + } + + printf("\n --- E2.5: _ANEInMemoryModel compilation methods ---\n"); + { + Class gI = NSClassFromString(@"_ANEInMemoryModel"); + unsigned int mc; + Method *ms = class_copyMethodList(gI, &mc); + for (unsigned int i = 0; i < mc; i++) { + const char *mn = sel_getName(method_getName(ms[i])); + if (strstr(mn, "compile") || strstr(mn, "Compile") || + strstr(mn, "hwx") || strstr(mn, "HWX") || + strstr(mn, "binary") || strstr(mn, "Binary") || + strstr(mn, "save") || strstr(mn, "Save") || + strstr(mn, "export") || strstr(mn, "Export") || + strstr(mn, "path") || strstr(mn, "Path") || + strstr(mn, "url") || strstr(mn, "URL") || + strstr(mn, "temp") || strstr(mn, "Temp") || + strstr(mn, "cache") || strstr(mn, "Cache")) { + printf(" - %s [%s]\n", mn, + method_getTypeEncoding(ms[i])); + } + } + free(ms); + } + } + + // ================================================================= + // EXPERIMENT F: Full chaining pipeline with _ANEModel + // ================================================================= + printf("\n------------------------------------------------------------\n"); + printf(" EXPERIMENT F: Full chaining pipeline\n"); + 
printf("------------------------------------------------------------\n"); + { + if (!diskModel1) { + printf(" SKIPPED: _ANEModel not loaded\n"); + printf(" Fallback: prepareChainingWithModel on InMemoryModel\n\n"); + + @try { + id ioObj1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id buf1 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObj1, @0, (long long)0); + + id outIO1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id outBuf1 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + outIO1, @0, (long long)1); + + IOSurfaceRef statsSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)( + gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[outBuf1]); + + id chainReq = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[buf1], @[outSet], @(-1), @(-1), @0, + @[], @0, @0, @0); + + if (chainReq) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + chainReq, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + k1.model, @{}, chainReq, (unsigned int)21, + &prepErr); + printf(" prepareChainingWithModel (InMemory): %s\n", + prepOk ? 
"YES" : "NO"); + if (!prepOk && prepErr) + printf(" Error: %s\n", + [[prepErr description] UTF8String]); + } + CFRelease(statsSurf); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + printf(" (Expected: getUUID unrecognized selector)\n"); + } + } else { + printf(" Using _ANEModel for chaining!\n\n"); + + NSArray *inputSymbols = nil; + NSArray *outputSymbols = nil; + @try { + SEL inSel = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + inputSymbols = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + diskModel1, inSel, (NSUInteger)0); + SEL outSel = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + outputSymbols = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + diskModel1, outSel, (NSUInteger)0); + printf(" Input symbols: %s\n", + inputSymbols ? [[inputSymbols description] UTF8String] + : "nil"); + printf(" Output symbols: %s\n", + outputSymbols ? [[outputSymbols description] UTF8String] + : "nil"); + } @catch (NSException *ex) { + printf(" Symbol query EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + @try { + NSNumber *inSymIdx = (inputSymbols.count > 0) + ? inputSymbols[0] : @0; + NSNumber *outSymIdx = (outputSymbols.count > 0) + ? 
outputSymbols[0] : @0; + + id ioObj1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id inBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObj1, inSymIdx, (long long)0); + + id outIO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + outIO, outSymIdx, (long long)1); + + IOSurfaceRef statsSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)( + gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[outBuf]); + + id chainReq = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[inBuf], @[outSet], @(-1), @(-1), @0, + @[], @0, @0, @0); + + if (chainReq) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + chainReq, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + diskModel1, @{}, chainReq, (unsigned int)21, + &prepErr); + printf(" prepareChainingWithModel: %s\n", + prepOk ? 
"YES" : "NO"); + if (!prepOk && prepErr) + printf(" Error: %s\n", + [[prepErr description] UTF8String]); + + if (prepOk) { + printf(" CHAINING PREPARE SUCCEEDED!\n"); + + @try { + NSError *enqErr = nil; + BOOL enqOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, + @selector(enqueueSetsWithModel:outputSet: + options:qos:error:), + diskModel1, outSet, @{}, + (unsigned int)21, &enqErr); + printf(" enqueueSets: %s\n", + enqOk ? "YES" : "NO"); + if (!enqOk && enqErr) + printf(" Error: %s\n", + [[enqErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" enqueueSets EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + @try { + NSError *rdyErr = nil; + BOOL rdyOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, + @selector(buffersReadyWithModel: + inputBuffers:options:qos:error:), + diskModel1, @[inBuf], @{}, + (unsigned int)21, &rdyErr); + printf(" buffersReady: %s\n", + rdyOk ? "YES" : "NO"); + if (!rdyOk && rdyErr) + printf(" Error: %s\n", + [[rdyErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" buffersReady EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + // Benchmark sequential baseline + printf("\n --- Benchmark ---\n"); + id wI = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + id wO = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id)) + objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices: + outputs:outputIndices:weightsBuffer: + perfStats:procedureIndex:), + @[wI], @[@0], @[wO], @[@0], nil, nil, @0); + + int iters = 100; + NSError *seqErr = nil; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < iters; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id, + NSError**))objc_msgSend)(k1.model, + @selector(evaluateWithQoS:options: + request:error:), + 21, @{}, req, &seqErr); + } + double seqMs 
= tb_ms( + mach_absolute_time() - t0) / iters; + printf(" Sequential: %.3f ms/kernel\n", seqMs); + } + } + CFRelease(statsSurf); + } @catch (NSException *ex) { + printf(" Chaining EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + + // ================================================================= + // EXPERIMENT G: IOSurfaceSharedEvent / hardware fences + // ================================================================= + printf("\n------------------------------------------------------------\n"); + printf(" EXPERIMENT G: IOSurfaceSharedEvent / hardware fences\n"); + printf("------------------------------------------------------------\n"); + { + Class gSigEvent = NSClassFromString(@"_ANESharedSignalEvent"); + Class gWaitEvent = NSClassFromString(@"_ANESharedWaitEvent"); + + printf("\n --- G.1: Event class API ---\n"); + if (gSigEvent) dump_class("_ANESharedSignalEvent"); + else printf(" _ANESharedSignalEvent: NOT FOUND\n"); + if (gWaitEvent) dump_class("_ANESharedWaitEvent"); + else printf(" _ANESharedWaitEvent: NOT FOUND\n"); + + printf("\n --- G.2: MTLSharedEvent via Metal ---\n"); + @try { + void *metalH = dlopen( + "/System/Library/Frameworks/Metal.framework/Metal", + RTLD_NOW); + if (metalH) { + id (*createDev)(void) = dlsym(metalH, + "MTLCreateSystemDefaultDevice"); + if (createDev) { + id dev = createDev(); + printf(" MTLDevice: %s\n", + dev ? [[dev description] UTF8String] : "nil"); + + if (dev) { + SEL newEvt = NSSelectorFromString( + @"newSharedEvent"); + if ([dev respondsToSelector:newEvt]) { + id shEvt = ((id(*)(id,SEL))objc_msgSend)( + dev, newEvt); + printf(" MTLSharedEvent: %s\n", + shEvt ? 
[[shEvt description] UTF8String] + : "nil"); + + if (shEvt && gSigEvent) { + printf("\n --- G.3: _ANESharedSignalEvent " + "with MTLSharedEvent ---\n"); + // Factory: (Q16 I24 q28 @36) = + // (uint64_t, unsigned int, long long, id) + @try { + SEL sigSel = NSSelectorFromString( + @"signalEventWithValue:symbolIndex:" + "eventType:sharedEvent:"); + if ([gSigEvent respondsToSelector:sigSel]) { + for (int et = 0; et <= 2; et++) { + id se = ((id(*)(Class,SEL, + uint64_t,unsigned int, + long long,id)) + objc_msgSend)(gSigEvent, + sigSel, (uint64_t)1, + (unsigned int)0, + (long long)et, shEvt); + printf(" eventType=%d: %s\n", + et, + se ? [[se description] + UTF8String] : "nil"); + if (se) { + dump_all_properties( + se, gSigEvent); + } + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + if (shEvt && gWaitEvent) { + printf("\n --- G.4: _ANESharedWaitEvent " + "with MTLSharedEvent ---\n"); + // Factory: waitEventWithValue:sharedEvent: + // (Q16 @24) = (uint64_t, id) + // Factory: waitEventWithValue:sharedEvent:eventType: + // (Q16 @24 Q32) = (uint64_t, id, uint64_t) + @try { + SEL wSel = NSSelectorFromString( + @"waitEventWithValue:sharedEvent:"); + if ([gWaitEvent respondsToSelector:wSel]) { + id we = ((id(*)(Class,SEL, + uint64_t,id)) + objc_msgSend)(gWaitEvent, + wSel, (uint64_t)1, shEvt); + printf(" waitEvent(2-param): %s\n", + we ? [[we description] + UTF8String] : "nil"); + if (we) + dump_all_properties( + we, gWaitEvent); + } + } @catch (NSException *ex) { + printf(" 2-param EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + @try { + SEL wSel3 = NSSelectorFromString( + @"waitEventWithValue:" + "sharedEvent:eventType:"); + if ([gWaitEvent respondsToSelector:wSel3]) { + id we = ((id(*)(Class,SEL, + uint64_t,id,uint64_t)) + objc_msgSend)(gWaitEvent, + wSel3, (uint64_t)1, shEvt, + (uint64_t)0); + printf(" waitEvent(3-param): %s\n", + we ? 
[[we description] + UTF8String] : "nil"); + if (we) + dump_all_properties( + we, gWaitEvent); + } + } @catch (NSException *ex) { + printf(" 3-param EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + } + } + } + } @catch (NSException *ex) { + printf(" Metal EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- G.5: IOSurfaceSharedEventCreate ---\n"); + @try { + void *iosH = dlopen( + "/System/Library/Frameworks/" + "IOSurface.framework/IOSurface", RTLD_NOW); + if (iosH) { + typedef id (*CreateFunc)(void); + CreateFunc fn = dlsym(iosH, "IOSurfaceSharedEventCreate"); + if (fn) { + id iosEvt = fn(); + printf(" IOSurfaceSharedEventCreate: %s\n", + iosEvt ? [[iosEvt description] UTF8String] + : "nil"); + + // Try using IOSurfaceSharedEvent with signal/wait + if (iosEvt && gSigEvent) { + printf("\n G.5b: SignalEvent with IOSurfaceSharedEvent\n"); + @try { + SEL sigSel = NSSelectorFromString( + @"signalEventWithValue:symbolIndex:" + "eventType:sharedEvent:"); + id se = ((id(*)(Class,SEL,uint64_t, + unsigned int,long long,id)) + objc_msgSend)(gSigEvent, sigSel, + (uint64_t)1, (unsigned int)0, + (long long)0, iosEvt); + printf(" signalEvent: %s\n", + se ? [[se description] UTF8String] + : "nil"); + if (se) dump_all_properties(se, gSigEvent); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + if (iosEvt && gWaitEvent) { + printf("\n G.5c: WaitEvent with IOSurfaceSharedEvent\n"); + @try { + SEL wSel = NSSelectorFromString( + @"waitEventWithValue:sharedEvent:"); + id we = ((id(*)(Class,SEL,uint64_t,id)) + objc_msgSend)(gWaitEvent, wSel, + (uint64_t)1, iosEvt); + printf(" waitEvent: %s\n", + we ? 
[[we description] UTF8String] + : "nil"); + if (we) dump_all_properties(we, gWaitEvent); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } else { + printf(" IOSurfaceSharedEventCreate: not found\n"); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // ================================================================= + // EXPERIMENT H: Alternative chaining preparation + // ================================================================= + printf("\n------------------------------------------------------------\n"); + printf(" EXPERIMENT H: Alternative chaining preparation\n"); + printf("------------------------------------------------------------\n"); + { + printf("\n --- H.1: _ANEClient chaining methods ---\n"); + Class clientCls = NSClassFromString(@"_ANEClient"); + if (clientCls) { + unsigned int mc; + Method *ms = class_copyMethodList(clientCls, &mc); + for (unsigned int i = 0; i < mc; i++) { + const char *mn = sel_getName(method_getName(ms[i])); + if (strstr(mn, "chain") || strstr(mn, "Chain") || + strstr(mn, "prepare") || strstr(mn, "Prepare") || + strstr(mn, "enqueue") || strstr(mn, "Enqueue") || + strstr(mn, "buffer") || strstr(mn, "Buffer") || + strstr(mn, "ready") || strstr(mn, "Ready") || + strstr(mn, "pipeline") || strstr(mn, "Pipeline") || + strstr(mn, "batch") || strstr(mn, "Batch") || + strstr(mn, "async") || strstr(mn, "Async")) { + printf(" - %s [%s]\n", mn, + method_getTypeEncoding(ms[i])); + } + } + free(ms); + + printf("\n --- H.2: doPrepareChainingWithModel ---\n"); + SEL doPrep = NSSelectorFromString( + @"doPrepareChainingWithModel:options:" + "chainingReq:qos:error:"); + if ([client respondsToSelector:doPrep]) { + printf(" doPrepareChainingWithModel EXISTS\n"); + + @try { + id ioObj = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + id buf = ((id(*)(Class,SEL,id,id,long long)) + 
objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject: + symbolIndex:source:), + ioObj, @0, (long long)0); + id outIO = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long)) + objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject: + symbolIndex:source:), + outIO, @0, (long long)1); + IOSurfaceRef ss = make_surface(64); + id os = ((id(*)(Class,SEL,IOSurfaceRef,id)) + objc_msgSend)(gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + ss, @[outBuf]); + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId: + procedureIndex:signalEvents: + transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[buf], @[os], @(-1), @(-1), @0, + @[], @0, @0, @0); + + NSError *err = nil; + printf(" With _ANEInMemoryModel...\n"); + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, doPrep, + k1.model, @{}, cr, (unsigned int)21, &err); + printf(" Result: %s\n", ok ? 
"YES" : "NO"); + if (!ok && err) + printf(" Error: %s\n", + [[err description] UTF8String]); + CFRelease(ss); + } @catch (NSException *ex) { + printf(" InMemory EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + if (diskModel1) { + @try { + id ioObj = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + id buf = ((id(*)(Class,SEL,id,id,long long)) + objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject: + symbolIndex:source:), + ioObj, @0, (long long)0); + id outIO = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long)) + objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject: + symbolIndex:source:), + outIO, @0, (long long)1); + IOSurfaceRef ss = make_surface(64); + id os = ((id(*)(Class,SEL,IOSurfaceRef,id)) + objc_msgSend)(gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + ss, @[outBuf]); + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId: + procedureIndex:signalEvents: + transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[buf], @[os], @(-1), @(-1), @0, + @[], @0, @0, @0); + + NSError *err = nil; + printf(" With _ANEModel...\n"); + BOOL ok = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, doPrep, + diskModel1, @{}, cr, (unsigned int)21, &err); + printf(" Result: %s\n", ok ? 
"YES" : "NO"); + if (!ok && err) + printf(" Error: %s\n", + [[err description] UTF8String]); + CFRelease(ss); + } @catch (NSException *ex) { + printf(" ANEModel EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } else { + printf(" NOT available\n"); + } + + printf("\n --- H.3: All _ANEClient methods ---\n"); + unsigned int allC; + Method *allM = class_copyMethodList(clientCls, &allC); + printf(" Total: %u\n", allC); + for (unsigned int i = 0; i < allC; i++) { + printf(" - %s\n", + sel_getName(method_getName(allM[i]))); + } + free(allM); + } + } + + // ================================================================= + // Experiment K: ChainingRequest Factory Type Encoding Analysis + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment K: Type Encoding Analysis\n"); + printf("==============================================================\n\n"); + { + Class clientCls = object_getClass(client); + + SEL chainFactorySel = @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:); + Method chainFactory = class_getClassMethod(gChain, chainFactorySel); + if (chainFactory) { + const char *enc = method_getTypeEncoding(chainFactory); + printf(" ChainingRequest factory encoding: %s\n", enc ? 
enc : "nil"); + + if (enc) { + const char *paramNames[] = { + "return", "self", "_cmd", + "inputs", "outputSets", "lbInputSymbolId", + "lbOutputSymbolId", "procedureIndex", "signalEvents", + "transactionHandle", "fwEnqueueDelay", "memoryPoolId" + }; + unsigned int nargs = method_getNumberOfArguments(chainFactory); + printf(" Number of arguments: %u\n", nargs); + for (unsigned int i = 0; i < nargs && i < 12; i++) { + char argType[64] = {0}; + method_getArgumentType(chainFactory, i, argType, sizeof(argType)); + printf(" arg[%u] %-20s = %s", i, paramNames[i], argType); + if (argType[0] == '@') printf(" (id/object)"); + else if (argType[0] == 'q') printf(" (int64_t)"); + else if (argType[0] == 'Q') printf(" (uint64_t)"); + else if (argType[0] == 'i') printf(" (int32_t)"); + else if (argType[0] == 'I') printf(" (uint32_t)"); + else if (argType[0] == 'B') printf(" (BOOL)"); + else if (argType[0] == 'v') printf(" (void)"); + else if (argType[0] == ':') printf(" (SEL)"); + printf("\n"); + } + } + } else { + printf(" ChainingRequest factory: NOT FOUND\n"); + } + + SEL prepSel = @selector(prepareChainingWithModel:options: + chainingReq:qos:error:); + Method prepMethod = class_getInstanceMethod(clientCls, prepSel); + if (!prepMethod) + prepMethod = class_getInstanceMethod( + NSClassFromString(@"_ANEClient"), prepSel); + if (prepMethod) { + const char *enc = method_getTypeEncoding(prepMethod); + printf("\n prepareChainingWithModel encoding: %s\n", + enc ? 
enc : "nil"); + if (enc) { + const char *pNames[] = { + "return", "self", "_cmd", + "model", "options", "chainingReq", "qos", "error" + }; + unsigned int nargs = method_getNumberOfArguments(prepMethod); + printf(" Number of arguments: %u\n", nargs); + for (unsigned int i = 0; i < nargs && i < 8; i++) { + char argType[64] = {0}; + method_getArgumentType(prepMethod, i, argType, sizeof(argType)); + printf(" arg[%u] %-15s = %s", i, pNames[i], argType); + if (argType[0] == '@') printf(" (id/object)"); + else if (argType[0] == 'q') printf(" (int64_t)"); + else if (argType[0] == 'Q') printf(" (uint64_t)"); + else if (argType[0] == 'I') printf(" (uint32_t)"); + else if (argType[0] == 'B') printf(" (BOOL)"); + printf("\n"); + } + } + } else { + printf("\n prepareChainingWithModel: NOT FOUND\n"); + } + + SEL doPrepSel = NSSelectorFromString( + @"doPrepareChainingWithModel:options:chainingReq:qos:error:"); + Method doPrepMethod = class_getInstanceMethod( + NSClassFromString(@"_ANEClient"), doPrepSel); + if (doPrepMethod) { + const char *enc = method_getTypeEncoding(doPrepMethod); + printf("\n doPrepareChainingWithModel encoding: %s\n", + enc ? enc : "nil"); + unsigned int nargs = method_getNumberOfArguments(doPrepMethod); + printf(" Number of arguments: %u\n", nargs); + for (unsigned int i = 0; i < nargs; i++) { + char argType[64] = {0}; + method_getArgumentType(doPrepMethod, i, argType, sizeof(argType)); + printf(" arg[%u] = %s\n", i, argType); + } + } + + printf("\n --- K.2: All _ANEChainingRequest methods type encodings ---\n"); + { + unsigned int mc; + Method *cms = class_copyMethodList(object_getClass(gChain), &mc); + printf(" Class methods (%u):\n", mc); + for (unsigned int i = 0; i < mc; i++) { + const char *name = sel_getName(method_getName(cms[i])); + const char *enc = method_getTypeEncoding(cms[i]); + printf(" + %s\n encoding: %s\n", name, enc ? 
enc : "?"); + } + free(cms); + + Method *ims = class_copyMethodList(gChain, &mc); + printf(" Instance methods (%u):\n", mc); + for (unsigned int i = 0; i < mc; i++) { + const char *name = sel_getName(method_getName(ims[i])); + const char *enc = method_getTypeEncoding(ims[i]); + printf(" - %s\n encoding: %s\n", name, enc ? enc : "?"); + } + free(ims); + } + } + + // ================================================================= + // Experiment L: Array-Typed ChainingRequest Parameters + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment L: Array-Typed ChainingRequest Parameters\n"); + printf("==============================================================\n\n"); + BOOL chainingPrepSuccess = NO; + id bestChainReq = nil; + id bestModel = diskModel1 ? diskModel1 : k1.model; + { + id ioObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id inBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObj, @0, (long long)0); + + id outIO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + outIO, @0, (long long)1); + + IOSurfaceRef statsSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)( + gOutSets, @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[outBuf]); + + struct { + const char *label; + id lbIn; id lbOut; id procIdx; + } combos[] = { + { "arrays @[@(-1)]", @[@(-1)], @[@(-1)], @[@0] }, + { "arrays @[@0]", @[@0], @[@0], @[@0] }, + { "empty arrays @[]", @[], @[], @[] }, + { "nil values", nil, nil, nil }, + { "original NSNumber", @(-1), @(-1), @0 }, + }; + int ncombos = sizeof(combos)/sizeof(combos[0]); + 
+ for (int ci = 0; ci < ncombos; ci++) { + printf(" --- L.%d: %s ---\n", ci+1, combos[ci].label); + @try { + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[inBuf], @[outSet], + combos[ci].lbIn, combos[ci].lbOut, combos[ci].procIdx, + @[], @0, @0, @0); + + if (!cr) { + printf(" ChainingRequest: nil\n\n"); + continue; + } + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + cr, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + printf(" desc: %s\n", + [[cr description] UTF8String]); + + @try { + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + bestModel, @{}, cr, (unsigned int)21, &prepErr); + printf(" prepareChainingWithModel: %s\n", + prepOk ? "YES" : "NO"); + if (prepErr) + printf(" Error: %s\n", + [[prepErr description] UTF8String]); + if (prepOk || !prepErr) { + chainingPrepSuccess = prepOk; + bestChainReq = cr; + printf(" *** GOT PAST THE CRASH! 
***\n"); + } + } @catch (NSException *prepEx) { + printf(" prepare EXCEPTION: %s\n", + [[prepEx reason] UTF8String]); + } + } @catch (NSException *ex) { + printf(" factory EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + printf("\n"); + } + + CFRelease(statsSurf); + } + + // ================================================================= + // Experiment M: Load Model via _ANEClient + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment M: Load Model via _ANEClient\n"); + printf("==============================================================\n\n"); + id fullyLoadedModel = nil; + { + if (!diskModel1) { + printf(" SKIPPED: no _ANEModel from Experiment E\n"); + } else { + @try { + id st = [diskModel1 valueForKey:@"state"]; + printf(" diskModel1 state before: %s\n", + st ? [[st description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" state query exception: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- M.1: compiledModelExistsFor: ---\n"); + @try { + SEL existsSel = NSSelectorFromString( + @"compiledModelExistsFor:"); + if ([client respondsToSelector:existsSel]) { + BOOL exists = ((BOOL(*)(id,SEL,id))objc_msgSend)( + client, existsSel, diskModel1); + printf(" compiledModelExistsFor: %s\n", + exists ? "YES" : "NO"); + } else { + printf(" compiledModelExistsFor: NOT AVAILABLE\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- M.2: loadModel:options:qos:error: ---\n"); + @try { + SEL loadSel = NSSelectorFromString( + @"loadModel:options:qos:error:"); + if ([client respondsToSelector:loadSel]) { + NSError *loadErr = nil; + BOOL loadOk = ((BOOL(*)(id,SEL,id,id,unsigned int, + NSError**))objc_msgSend)(client, loadSel, + diskModel1, @{}, (unsigned int)21, &loadErr); + printf(" loadModel: %s\n", loadOk ? 
"YES" : "NO"); + if (loadErr) + printf(" Error: %s\n", + [[loadErr description] UTF8String]); + + if (loadOk) { + fullyLoadedModel = diskModel1; + @try { + SEL inSel = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + id inSyms = ((id(*)(id,SEL,NSUInteger)) + objc_msgSend)(diskModel1, inSel, 0); + SEL outSel = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + id outSyms = ((id(*)(id,SEL,NSUInteger)) + objc_msgSend)(diskModel1, outSel, 0); + printf(" After load - inputSymbols: %s\n", + inSyms ? [[inSyms description] UTF8String] + : "nil/empty"); + printf(" After load - outputSymbols: %s\n", + outSyms ? [[outSyms description] UTF8String] + : "nil/empty"); + } @catch (NSException *ex) { + printf(" Symbol query EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } else { + printf(" loadModel: NOT AVAILABLE\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- M.3: compileModel:options:qos:error: ---\n"); + @try { + SEL compileSel = NSSelectorFromString( + @"compileModel:options:qos:error:"); + if ([client respondsToSelector:compileSel]) { + NSError *compErr = nil; + BOOL compOk = ((BOOL(*)(id,SEL,id,id,unsigned int, + NSError**))objc_msgSend)(client, compileSel, + diskModel1, @{}, (unsigned int)21, &compErr); + printf(" compileModel: %s\n", compOk ? "YES" : "NO"); + if (compErr) + printf(" Error: %s\n", + [[compErr description] UTF8String]); + + if (compOk) { + fullyLoadedModel = diskModel1; + @try { + id inSyms = ((id(*)(id,SEL,NSUInteger)) + objc_msgSend)(diskModel1, + NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"), + 0); + id outSyms = ((id(*)(id,SEL,NSUInteger)) + objc_msgSend)(diskModel1, + NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"), + 0); + printf(" After compile - inputSymbols: %s\n", + inSyms ? [[inSyms description] UTF8String] + : "nil/empty"); + printf(" After compile - outputSymbols: %s\n", + outSyms ? 
[[outSyms description] UTF8String] + : "nil/empty"); + } @catch (NSException *ex) { + printf(" Symbol query EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } else { + printf(" compileModel: NOT AVAILABLE\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + @try { + id st = [diskModel1 valueForKey:@"state"]; + printf("\n diskModel1 state after: %s\n", + st ? [[st description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf("\n state query exception: %s\n", + [[ex reason] UTF8String]); + } + } + } + + // ================================================================= + // Experiment N: IOSurface Mapping via _ANEProgramIOSurfacesMapper + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment N: IOSurface Mapping\n"); + printf("==============================================================\n\n"); + { + Class gMapper = NSClassFromString(@"_ANEProgramIOSurfacesMapper"); + if (!gMapper) { + printf(" _ANEProgramIOSurfacesMapper: NOT FOUND\n"); + } else { + printf(" _ANEProgramIOSurfacesMapper: FOUND\n"); + dump_class("_ANEProgramIOSurfacesMapper"); + + id progHandle = nil; + @try { + progHandle = [k1.model valueForKey:@"programHandle"]; + } @catch (NSException *ex) { + printf(" programHandle exception: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- N.1: mapperWithProgramHandle: ---\n"); + id mapper = nil; + if (progHandle) { + uint64_t ph = [progHandle unsignedLongLongValue]; + printf(" programHandle = %llu\n", ph); + @try { + SEL mapperSel = NSSelectorFromString( + @"mapperWithProgramHandle:"); + mapper = ((id(*)(Class,SEL,uint64_t))objc_msgSend)( + gMapper, mapperSel, ph); + printf(" mapper created: %s\n", + mapper ? 
[[mapper description] UTF8String] + : "nil"); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + id targetModel = diskModel1 ? diskModel1 : k1.model; + const char *modelType = diskModel1 ? "_ANEModel" : "InMemoryModel"; + + if (mapper) { + printf("\n --- N.2: mapIOSurfacesWithModel: (%s) ---\n", + modelType); + @try { + id ioObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id outIO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id)) + objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices: + outputs:outputIndices:weightsBuffer: + perfStats:procedureIndex:), + @[ioObj], @[@0], @[outIO], @[@0], nil, nil, @0); + + if (req) { + SEL mapSel = NSSelectorFromString( + @"mapIOSurfacesWithModel:request:" + "cacheInference:error:"); + NSError *mapErr = nil; + BOOL mapOk = ((BOOL(*)(id,SEL,id,id,BOOL, + NSError**))objc_msgSend)(mapper, mapSel, + targetModel, req, NO, &mapErr); + printf(" mapIOSurfaces: %s\n", + mapOk ? 
"YES" : "NO"); + if (mapErr) + printf(" Error: %s\n", + [[mapErr description] UTF8String]); + + if (mapOk) { + dump_all_properties(mapper, + [mapper class]); + } + } else { + printf(" Request creation failed\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- N.3: validateRequest:model: ---\n"); + @try { + id ioObj = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id outIO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id)) + objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices: + outputs:outputIndices:weightsBuffer: + perfStats:procedureIndex:), + @[ioObj], @[@0], @[outIO], @[@0], nil, nil, @0); + + SEL valSel = NSSelectorFromString( + @"validateRequest:model:"); + BOOL valOk = ((BOOL(*)(id,SEL,id,id))objc_msgSend)( + mapper, valSel, req, targetModel); + printf(" validateRequest: %s\n", + valOk ? "YES" : "NO"); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + if (diskModel1) { + printf("\n --- N.4: _ANEModel.mapper property ---\n"); + @try { + id modelMapper = [diskModel1 valueForKey:@"mapper"]; + printf(" model.mapper: %s\n", + modelMapper + ? 
[[modelMapper description] UTF8String] + : "nil"); + if (modelMapper) { + dump_all_properties(modelMapper, + [modelMapper class]); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + } + + // ================================================================= + // Experiment O: Procedure Info Extraction + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment O: Procedure Info Extraction\n"); + printf("==============================================================\n\n"); + { + id targetModel = diskModel1 ? diskModel1 : k1.model; + const char *modelType = diskModel1 ? "_ANEModel" : "InMemoryModel"; + printf(" Using: %s\n", modelType); + + printf("\n --- O.1: procedureInfoForProcedureIndex:0 ---\n"); + @try { + SEL piSel = NSSelectorFromString( + @"procedureInfoForProcedureIndex:"); + if ([targetModel respondsToSelector:piSel]) { + id pInfo = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + targetModel, piSel, (NSUInteger)0); + printf(" procedureInfo: %s\n", + pInfo ? [[pInfo description] UTF8String] : "nil"); + if (pInfo) { + printf(" class: %s\n", + [NSStringFromClass([pInfo class]) UTF8String]); + dump_all_properties(pInfo, [pInfo class]); + } + } else { + printf(" procedureInfoForProcedureIndex: NOT AVAILABLE\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- O.2: procedureCount ---\n"); + @try { + SEL pcSel = NSSelectorFromString(@"procedureCount"); + if ([targetModel respondsToSelector:pcSel]) { + NSUInteger pc = ((NSUInteger(*)(id,SEL))objc_msgSend)( + targetModel, pcSel); + printf(" procedureCount: %lu\n", (unsigned long)pc); + } else { + printf(" procedureCount: NOT AVAILABLE\n"); + id pcVal = nil; + @try { + pcVal = [targetModel valueForKey:@"procedureCount"]; + printf(" procedureCount (KVC): %s\n", + pcVal ? 
[[pcVal description] UTF8String] : "nil"); + } @catch (NSException *ex2) { + printf(" procedureCount KVC: %s\n", + [[ex2 reason] UTF8String]); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- O.3: modelAttributes ---\n"); + @try { + id attrs = [targetModel valueForKey:@"modelAttributes"]; + printf(" modelAttributes: %s\n", + attrs ? [[attrs description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + printf("\n --- O.4: inputSymbolNames / outputSymbolNames ---\n"); + @try { + SEL inNamesSel = NSSelectorFromString(@"inputSymbolNames"); + if ([targetModel respondsToSelector:inNamesSel]) { + id names = ((id(*)(id,SEL))objc_msgSend)( + targetModel, inNamesSel); + printf(" inputSymbolNames: %s\n", + names ? [[names description] UTF8String] : "nil"); + } else { + printf(" inputSymbolNames: NOT AVAILABLE as method\n"); + @try { + id n = [targetModel valueForKey:@"inputSymbolNames"]; + printf(" inputSymbolNames (KVC): %s\n", + n ? [[n description] UTF8String] : "nil"); + } @catch (NSException *ex2) { + printf(" inputSymbolNames KVC: not available\n"); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + @try { + SEL outNamesSel = NSSelectorFromString(@"outputSymbolNames"); + if ([targetModel respondsToSelector:outNamesSel]) { + id names = ((id(*)(id,SEL))objc_msgSend)( + targetModel, outNamesSel); + printf(" outputSymbolNames: %s\n", + names ? [[names description] UTF8String] : "nil"); + } else { + printf(" outputSymbolNames: NOT AVAILABLE as method\n"); + @try { + id n = [targetModel valueForKey:@"outputSymbolNames"]; + printf(" outputSymbolNames (KVC): %s\n", + n ? 
[[n description] UTF8String] : "nil"); + } @catch (NSException *ex2) { + printf(" outputSymbolNames KVC: not available\n"); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (diskModel1) { + printf("\n --- O.5: Full _ANEModel property dump ---\n"); + dump_all_properties(diskModel1, + NSClassFromString(@"_ANEModel")); + } + } + + // ================================================================= + // Experiment P: Full Chaining Retry with Fixes + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment P: Full Chaining Retry\n"); + printf("==============================================================\n\n"); + BOOL chainExecuted = NO; + { + void *metalHandle = dlopen( + "/System/Library/Frameworks/Metal.framework/Metal", RTLD_NOW); + (void)metalHandle; + id mtlDev = nil; + { + id (*createDev)(void) = (id(*)(void)) + dlsym(RTLD_DEFAULT, "MTLCreateSystemDefaultDevice"); + if (createDev) mtlDev = createDev(); + } + + id signalEvents = @[]; + id shEvt = nil; + if (mtlDev) { + printf(" Metal device: %s\n", + [[mtlDev description] UTF8String]); + @try { + shEvt = ((id(*)(id,SEL))objc_msgSend)( + mtlDev, NSSelectorFromString(@"newSharedEvent")); + if (shEvt) { + Class gSigEvent = NSClassFromString( + @"_ANESharedSignalEvent"); + if (gSigEvent) { + long long et = 0; + @try { + id etObj = [gSigEvent valueForKey: + @"ANESignalEventTypeMTLSharedEvent"]; + if (etObj) et = [etObj longLongValue]; + } @catch (NSException *ex) { (void)ex; } + + SEL sigSel = NSSelectorFromString( + @"signalEventWithValue:symbolIndex:" + "eventType:sharedEvent:"); + id se = ((id(*)(Class,SEL,uint64_t,unsigned int, + long long,id))objc_msgSend)( + gSigEvent, sigSel, (uint64_t)1, + (unsigned int)0, et, shEvt); + if (se) + signalEvents = @[se]; + printf(" SignalEvent: %s\n", + se ? 
"created" : "nil"); + } + } + } @catch (NSException *ex) { + printf(" SharedEvent EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + struct { + const char *label; + id model; + } modelCandidates[3]; + int nCandidates = 0; + + CompiledKernel k3 = compile_kernel(64, 32); + if (k3.model) { + NSURL *url = [NSURL fileURLWithPath:k3.tmpDir isDirectory:YES]; + Class gANEModel = NSClassFromString(@"_ANEModel"); + id freshDisk = ((id(*)(Class,SEL,id,id))objc_msgSend)( + gANEModel, @selector(modelAtURL:key:), url, k3.hexId); + if (freshDisk) { + id ph = [k3.model valueForKey:@"programHandle"]; + id prog = [k3.model valueForKey:@"program"]; + if (ph) ((void(*)(id,SEL,uint64_t))objc_msgSend)( + freshDisk, @selector(setProgramHandle:), + [ph unsignedLongLongValue]); + if (prog) ((void(*)(id,SEL,id))objc_msgSend)( + freshDisk, @selector(setProgram:), prog); + + modelCandidates[nCandidates++] = (typeof(modelCandidates[0])) + {"fresh _ANEModel (state=1)", freshDisk}; + } + modelCandidates[nCandidates++] = (typeof(modelCandidates[0])) + {"InMemoryModel (k3)", k3.model}; + } + if (diskModel1) { + modelCandidates[nCandidates++] = (typeof(modelCandidates[0])) + {"populated _ANEModel (from E)", diskModel1}; + } + + for (int mi = 0; mi < nCandidates; mi++) { + printf("\n --- P.%d: %s ---\n", mi+1, modelCandidates[mi].label); + id chainModel = modelCandidates[mi].model; + printf(" class: %s\n", + [NSStringFromClass([chainModel class]) UTF8String]); + @try { + id st = [chainModel valueForKey:@"state"]; + printf(" state: %s\n", + st ? 
[[st description] UTF8String] : "N/A"); + } @catch (NSException *ex) { (void)ex; } + + id ioObj1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id inBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObj1, @0, (long long)0); + + id outIO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + outIO, @0, (long long)1); + + IOSurfaceRef sSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)( + gOutSets, @selector(objectWithstatsSurRef:outputBuffer:), + sSurf, @[outBuf]); + + @try { + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[inBuf], @[outSet], nil, nil, nil, + signalEvents, @0, @0, @0); + + if (!cr) { + printf(" ChainingRequest: nil\n"); + CFRelease(sSurf); + continue; + } + + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + cr, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + chainModel, @{}, cr, (unsigned int)21, &prepErr); + printf(" prepare: %s\n", prepOk ? "YES" : "NO"); + if (prepErr) + printf(" prepareError: %s\n", + [[prepErr description] UTF8String]); + + if (prepOk) { + printf(" *** PREPARE SUCCEEDED! 
***\n"); + chainingPrepSuccess = YES; + chainExecuted = YES; + + printf("\n --- enqueueSetsWithModel ---\n"); + @try { + SEL eqSel = NSSelectorFromString( + @"enqueueSetsWithModel:outputSet:" + "options:qos:error:"); + NSError *eqErr = nil; + BOOL eqOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, eqSel, chainModel, outSet, @{}, + (unsigned int)21, &eqErr); + printf(" enqueueSets: %s\n", + eqOk ? "YES" : "NO"); + if (eqErr) + printf(" Error: %s\n", + [[eqErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- buffersReadyWithModel ---\n"); + @try { + SEL brSel = NSSelectorFromString( + @"buffersReadyWithModel:inputBuffers:" + "options:qos:error:"); + NSError *brErr = nil; + BOOL brOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, brSel, chainModel, @[inBuf], @{}, + (unsigned int)21, &brErr); + printf(" buffersReady: %s\n", + brOk ? "YES" : "NO"); + if (brErr) + printf(" Error: %s\n", + [[brErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- Benchmark ---\n"); + uint64_t t0 = mach_absolute_time(); + int niters = 50; + for (int i = 0; i < niters; i++) { + @try { + SEL brSel = NSSelectorFromString( + @"buffersReadyWithModel:inputBuffers:" + "options:qos:error:"); + ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)( + client, brSel, chainModel, @[inBuf], + @{}, (unsigned int)21, nil); + } @catch (NSException *ex) { + if (i == 0) + printf(" Bench EXCEPTION: %s\n", + [[ex reason] UTF8String]); + break; + } + } + double elapsed = tb_ms(mach_absolute_time() - t0); + printf(" %d iters in %.3f ms (%.4f ms/iter)\n", + niters, elapsed, elapsed / niters); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + CFRelease(sSurf); + } + + printf("\n --- P.extra: Try with 
_ANEInputBuffersReady ---\n"); + @try { + Class gIBR = NSClassFromString(@"_ANEInputBuffersReady"); + if (gIBR) { + dump_class("_ANEInputBuffersReady"); + SEL ibrSel = NSSelectorFromString( + @"inputBuffersWithProcedureIndex:inputBufferInfoIndex:" + "inputFreeValue:executionDelay:"); + if (class_getClassMethod(gIBR, ibrSel)) { + Method m = class_getClassMethod(gIBR, ibrSel); + const char *enc = method_getTypeEncoding(m); + printf(" inputBuffersReady encoding: %s\n", + enc ? enc : "?"); + unsigned int na = method_getNumberOfArguments(m); + printf(" args: %u\n", na); + for (unsigned int i = 0; i < na; i++) { + char at[64] = {0}; + method_getArgumentType(m, i, at, sizeof(at)); + printf(" [%u] = %s\n", i, at); + } + } + } + + Class gOSE = NSClassFromString(@"_ANEOutputSetEnqueue"); + if (gOSE) { + dump_class("_ANEOutputSetEnqueue"); + SEL oseSel = NSSelectorFromString( + @"outputSetWithProcedureIndex:setIndex:signalValue:" + "signalNotRequired:isOpenLoop:"); + if (class_getClassMethod(gOSE, oseSel)) { + Method m = class_getClassMethod(gOSE, oseSel); + const char *enc = method_getTypeEncoding(m); + printf(" outputSetEnqueue encoding: %s\n", + enc ? enc : "?"); + unsigned int na = method_getNumberOfArguments(m); + printf(" args: %u\n", na); + for (unsigned int i = 0; i < na; i++) { + char at[64] = {0}; + method_getArgumentType(m, i, at, sizeof(at)); + printf(" [%u] = %s\n", i, at); + } + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (k3.model) free_kernel(&k3); + } + + // ================================================================= + // Summary + // ================================================================= + printf("\n============================================================\n"); + printf(" RESULTS SUMMARY\n"); + printf("============================================================\n"); + printf(" Exp E: _ANEModel loaded: %s\n", + diskModel1 ? 
"YES" : "NO"); + printf(" Exp E2: ANECompiler found: (see above)\n"); + printf(" Exp F: Chaining pipeline: %s\n", + diskModel1 ? "ATTEMPTED" : "SKIPPED"); + printf(" Exp G: SharedEvents: (see above)\n"); + printf(" Exp H: Alt preparation: (see above)\n"); + printf(" Exp K: Type encodings: DONE\n"); + printf(" Exp L: Array params: %s\n", + chainingPrepSuccess ? "PREPARE SUCCEEDED" : "see above"); + printf(" Exp M: Client load model: %s\n", + fullyLoadedModel ? "LOADED" : "see above"); + printf(" Exp N: IOSurface mapping: DONE\n"); + printf(" Exp O: Procedure info: DONE\n"); + printf(" Exp P: Full chaining retry: %s\n", + chainExecuted ? "EXECUTED" : "see above"); + printf("============================================================\n"); + + free_kernel(&k1); + free_kernel(&k2); + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_bench_paths.m b/training/test_bench_paths.m new file mode 100644 index 0000000..437ff48 --- /dev/null +++ b/training/test_bench_paths.m @@ -0,0 +1,148 @@ +// test_bench_paths.m — Benchmark ANE evaluation paths at production dimensions +// Compares: standard, RT, processRequest, and ane_eval_rt wrapper +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; + +#include "ane_runtime.h" + +static NSString *gen_bench_conv(int ch, int sp) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + 
"val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; +} + +int main(int argc, char **argv) { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + + printf("=== ANE Eval Path Benchmark (production dimensions) ===\n\n"); + + ane_init(); + if (!g_ane_ok) { printf("FATAL: ANE not available\n"); return 1; } + + typedef struct { int ch; int sp; const char *label; } TestConfig; + TestConfig configs[] = { + {64, 32, "64x32 (test)"}, + {128, 64, "128x64 (small)"}, + {256, 64, "256x64 (med)"}, + {768, 256, "768x256 (prod)"}, + {512, 64, "512x64 (large)"}, + }; + int nconfigs = sizeof(configs) / sizeof(configs[0]); + int WARMUP = 20, ITERS = 200; + + id client = g_ane_client; + printf(" Client: %s | Warmup: %d | Iters: %d\n\n", client ? "OK" : "NO", WARMUP, ITERS); + printf("%-18s %10s %14s %14s %14s\n", "Config", "Standard", "RT", "ProcReq", "ane_eval_rt"); + printf("%-18s %10s %14s %14s %14s\n", "------", "--------", "--", "-------", "-----------"); + + for (int ci = 0; ci < nconfigs; ci++) { + int CH = configs[ci].ch, SP = configs[ci].sp; + + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)0.5f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); + + g_fp16_io = 1; + NSString *mil = gen_bench_conv(CH, SP); + NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding]; + size_t ioBytes = CH * SP * 2; + ANEKernel *k = ane_compile(milData, wdata, 1, &ioBytes, 1, &ioBytes); + if (!k) { 
printf("%-18s (compile failed)\n", configs[ci].label); continue; } + + IOSurfaceLock(k->ioInputs[0], 0, NULL); + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(k->ioInputs[0]); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + IOSurfaceUnlock(k->ioInputs[0], 0, NULL); + + NSError *e = nil; + + for (int i = 0; i < WARMUP; i++) ane_eval(k); + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) ane_eval(k); + double std_ms = tb_ms(mach_absolute_time() - t0) / ITERS; + + double rt_ms = -1; + if (client) { + @try { + for (int i = 0; i < WARMUP; i++) + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k->model, @{}, k->request, &e); + t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k->model, @{}, k->request, &e); + rt_ms = tb_ms(mach_absolute_time() - t0) / ITERS; + } @catch (NSException *ex) { rt_ms = -1; } + } + + double proc_ms = -1; + @try { + id prog = [k->model valueForKey:@"program"]; + id hexId = [k->model valueForKey:@"hexStringIdentifier"]; + SEL procSel = @selector(processRequest:model:qos:qIndex:modelStringID:options:returnValue:error:); + if (prog && [prog respondsToSelector:procSel]) { + for (int i = 0; i < WARMUP; i++) { + BOOL rv = NO; + ((BOOL(*)(id,SEL,id,id,unsigned int,int,id,id,BOOL*,NSError**))objc_msgSend)( + prog, procSel, k->request, k->model, 21, 0, hexId, @{}, &rv, &e); + } + t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + BOOL rv = NO; + ((BOOL(*)(id,SEL,id,id,unsigned int,int,id,id,BOOL*,NSError**))objc_msgSend)( + prog, procSel, k->request, k->model, 21, 0, hexId, @{}, &rv, &e); + } + proc_ms = tb_ms(mach_absolute_time() - t0) / ITERS; + } + } @catch (NSException *ex) { (void)ex; } + + double wrap_ms = -1; + @try { + for (int i = 0; i < WARMUP; i++) ane_eval_rt(k); + t0 = 
mach_absolute_time(); + for (int i = 0; i < ITERS; i++) ane_eval_rt(k); + wrap_ms = tb_ms(mach_absolute_time() - t0) / ITERS; + } @catch (NSException *ex) { wrap_ms = -1; } + + char s[32], r[32], p[32], w2[32]; + snprintf(s, 32, "%.3f ms", std_ms); + snprintf(r, 32, rt_ms >= 0 ? "%.3f (%.1fx)" : "N/A", rt_ms, std_ms/rt_ms); + snprintf(p, 32, proc_ms >= 0 ? "%.3f (%.1fx)" : "N/A", proc_ms, std_ms/proc_ms); + snprintf(w2, 32, wrap_ms >= 0 ? "%.3f (%.1fx)" : "N/A", wrap_ms, std_ms/wrap_ms); + printf("%-18s %10s %14s %14s %14s\n", configs[ci].label, s, r, p, w2); + + ane_free(k); + } + + printf("\n=== Benchmark complete ===\n"); + } + return 0; +} diff --git a/training/test_chaining_v2.m b/training/test_chaining_v2.m new file mode 100644 index 0000000..ac8520d --- /dev/null +++ b/training/test_chaining_v2.m @@ -0,0 +1,1700 @@ +// test_chaining_v2.m — Deep exploration of _ANEChainingRequest and related APIs +// Phases: +// 1. Dump unexplored ANE classes (mapper, buffer, output sets, etc.) +// 2. Query compiled model for symbol names and I/O mapping +// 3. Try _ANEProgramIOSurfacesMapper and _ANEBuffer for indexed IOSurfaces +// 4. Retry ChainingRequest with indexed surfaces +// 5. Test real-time eval path and perfStatsMask +// 6. 
Print structured summary +#import +#import +#import +#import +#import +#import +#include + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } +static int g_fp16_io = 0; + +#pragma mark — Helpers + +static void dump_class(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + printf("\n=== %s ===\n", name); + + unsigned int count; + Method *methods = class_copyMethodList(object_getClass(cls), &count); + if (count) printf(" Class methods (%u):\n", count); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" + %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + methods = class_copyMethodList(cls, &count); + if (count) printf(" Instance methods (%u):\n", count); + for (unsigned int i = 0; i < count; i++) { + SEL s = method_getName(methods[i]); + const char *enc = method_getTypeEncoding(methods[i]); + printf(" - %s [%s]\n", sel_getName(s), enc ? enc : "?"); + } + free(methods); + + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + if (pcount) printf(" Properties (%u):\n", pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); + } + free(props); +} + +static void try_alloc_init(const char *name) { + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) return; + @try { + id obj = [[cls alloc] init]; + printf(" %s alloc/init: %s\n", name, + obj ? 
[[obj description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s alloc/init EXCEPTION: %s\n", name, [[ex reason] UTF8String]); + } +} + +static void dump_all_properties(id obj, Class cls) { + if (!obj) return; + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(cls, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [obj valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname, [[ex reason] UTF8String]); + } + } + free(props); +} + +static IOSurfaceRef make_surface(size_t bytes) { + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +typedef struct { id model; IOSurfaceRef ioIn, ioOut; NSString *tmpDir; } CompiledKernel; + +static NSString *gen_conv_mil(int ch, int sp) { + if (g_fp16_io) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; + } + return [NSString 
stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\"), val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\"), val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\"), val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\"), val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\"), val=tensor(1)];\n" + " tensor to16 = const()[name=tensor(\"to16\"), val=tensor(\"fp16\")];\n" + " tensor x16 = cast(dtype=to16,x=x)[name=tensor(\"cin\")];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)" + "[name=tensor(\"conv\")];\n" + " tensor to32 = const()[name=tensor(\"to32\"), val=tensor(\"fp32\")];\n" + " tensor y = cast(dtype=to32,x=y16)[name=tensor(\"cout\")];\n" + " } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp]; +} + +static CompiledKernel compile_kernel(Class gD, Class gI, int ch, int sp, NSData *wdata) { + CompiledKernel k = {0}; + NSFileManager *fm = [NSFileManager defaultManager]; + + NSString *mil = gen_conv_mil(ch, sp); + NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding]; + + id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(gD, + @selector(modelWithMILText:weights:optionsPlist:), + md, @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}}, nil); + id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(gI, @selector(inMemoryModelWithDescriptor:), desc); + + id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier)); + NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx]; + [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] + withIntermediateDirectories:YES attributes:nil error:nil]; + [md writeToFile:[td 
stringByAppendingPathComponent:@"model.mil"] atomically:YES]; + [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES]; + + NSError *e = nil; + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e); + if (!ok) { + if (!g_fp16_io) { + printf(" fp32 compile failed, retrying with fp16 I/O\n"); + g_fp16_io = 1; + [fm removeItemAtPath:td error:nil]; + return compile_kernel(gD, gI, ch, sp, wdata); + } + printf(" Compile failed: %s\n", e ? [[e description] UTF8String] : "unknown"); + return k; + } + + ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)( + mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e); + + int bpe = g_fp16_io ? 2 : 4; + k.model = mdl; + k.ioIn = make_surface(ch * sp * bpe); + k.ioOut = make_surface(ch * sp * bpe); + k.tmpDir = td; + return k; +} + +#pragma mark — Result tracking + +typedef struct { + bool phase1_done; + int classes_found; + int classes_missing; + + bool phase2_done; + bool has_input_symbols; + bool has_output_symbols; + bool has_program_handle; + + bool phase3_done; + bool mapper_works; + bool buffer_works; + bool got_symbol_index; + + bool phase4_done; + bool validate_passed; + bool chaining_executed; + double sequential_ms; + double chained_ms; + + bool phase5_done; + bool realtime_eval_works; + bool perfstats_works; + uint64_t hw_exec_time_ns; + + bool phase7_done; + bool outputsets_with_stats_works; + bool chaining_with_stats_works; + + bool phase8_done; + bool disk_model_loads; + bool disk_model_has_symbols; + + bool phase9_done; + bool process_request_works; + double process_request_ms; + + bool phase10_done; + bool shared_events_exist; + + double rt_eval_ms; + double std_eval_ms; + double direct_eval_ms; +} Results; + +int main() { + @autoreleasepool { + setbuf(stdout, NULL); + mach_timebase_info(&g_tb); + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", 
RTLD_NOW); + + Results R = {0}; + + printf("╔══════════════════════════════════════════════════════════╗\n"); + printf("║ ANE ChainingRequest Deep Exploration v2 ║\n"); + printf("╚══════════════════════════════════════════════════════════╝\n\n"); + + Class gD = NSClassFromString(@"_ANEInMemoryModelDescriptor"); + Class gI = NSClassFromString(@"_ANEInMemoryModel"); + Class gAR = NSClassFromString(@"_ANERequest"); + Class gAIO = NSClassFromString(@"_ANEIOSurfaceObject"); + Class gClient= NSClassFromString(@"_ANEClient"); + Class gChain = NSClassFromString(@"_ANEChainingRequest"); + + if (!gD || !gI || !gAR || !gAIO) { + printf("FATAL: Core ANE classes not found\n"); + return 1; + } + + // ===================================================================== + // PHASE 1: Dump all unexplored ANE classes + // ===================================================================== + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 1: Class Introspection (unexplored classes)\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + const char *explore_classes[] = { + "_ANEProgramIOSurfacesMapper", + "_ANEBuffer", + "_ANEProgramForEvaluation", + "_ANEIOSurfaceOutputSets", + "_ANEInputBuffersReady", + "_ANEOutputSetEnqueue", + "_ANEModelInstanceParameters", + "_ANEDeviceController", + "_ANEQoSMapper", + NULL + }; + + for (int i = 0; explore_classes[i]; i++) { + Class cls = NSClassFromString([NSString stringWithUTF8String:explore_classes[i]]); + if (cls) R.classes_found++; + else R.classes_missing++; + dump_class(explore_classes[i]); + } + + printf("\n --- Alloc/init tests ---\n"); + for (int i = 0; explore_classes[i]; i++) { + try_alloc_init(explore_classes[i]); + } + + printf("\n --- Also dump _ANEIOSurfaceObject (for symbolIndex) ---\n"); + dump_class("_ANEIOSurfaceObject"); + dump_class("_ANEChainingRequest"); + dump_class("_ANEClient"); + + R.phase1_done = true; + printf("\n Phase 1 complete: %d classes 
found, %d missing\n", + R.classes_found, R.classes_missing); + + // ===================================================================== + // Compile test kernel (shared by subsequent phases) + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" Compiling test kernels...\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + int CH = 64, SP = 32; + + _Float16 *w = (_Float16*)calloc(CH*CH, sizeof(_Float16)); + for (int i = 0; i < CH; i++) w[i*CH+i] = (_Float16)0.5f; + int ws = CH*CH*2, tot = 128+ws; + uint8_t *blob = (uint8_t*)calloc(tot, 1); + blob[0]=1; blob[4]=2; blob[64]=0xEF; blob[65]=0xBE; blob[66]=0xAD; blob[67]=0xDE; blob[68]=1; + *(uint32_t*)(blob+72)=ws; *(uint32_t*)(blob+80)=128; + memcpy(blob+128, w, ws); + NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES]; + free(w); + + CompiledKernel k1 = compile_kernel(gD, gI, CH, SP, wdata); + CompiledKernel k2 = compile_kernel(gD, gI, CH, SP, wdata); + + if (!k1.model || !k2.model) { + printf("FATAL: Failed to compile test kernels\n"); + return 1; + } + printf(" Kernel 1: compiled and loaded (fp16_io=%d)\n", g_fp16_io); + printf(" Kernel 2: compiled and loaded\n"); + + int bpe = g_fp16_io ? 
2 : 4; + int ioBytes = CH * SP * bpe; + + IOSurfaceLock(k1.ioIn, 0, NULL); + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(k1.ioIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } + IOSurfaceUnlock(k1.ioIn, 0, NULL); + + // ===================================================================== + // PHASE 2: Symbol Name Discovery + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 2: Symbol Name Discovery\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + const char *model_keys[] = { + "inputSymbolNames", "outputSymbolNames", + "programHandle", "intermediateBufferHandle", + "hexStringIdentifier", "modelDescription", + "inputFeatureNames", "outputFeatureNames", + "perfStatsMask", + "numberOfInputs", "numberOfOutputs", + "compiledModelPath", "compiledModelURL", + "modelPath", "modelURL", + NULL + }; + + printf("\n --- _ANEInMemoryModel properties (k1) ---\n"); + for (int i = 0; model_keys[i]; i++) { + NSString *key = [NSString stringWithUTF8String:model_keys[i]]; + @try { + id val = [k1.model valueForKey:key]; + const char *desc = val ? [[val description] UTF8String] : "nil"; + size_t len = strlen(desc); + if (len > 200) { + printf(" %s = %.200s... 
(truncated, %zu chars)\n", model_keys[i], desc, len); + } else { + printf(" %s = %s\n", model_keys[i], desc); + } + if (strcmp(model_keys[i], "inputSymbolNames") == 0 && val) R.has_input_symbols = true; + if (strcmp(model_keys[i], "outputSymbolNames") == 0 && val) R.has_output_symbols = true; + if (strcmp(model_keys[i], "programHandle") == 0 && val) R.has_program_handle = true; + } @catch (NSException *ex) { + printf(" %s = \n", model_keys[i], [[ex reason] UTF8String]); + } + } + + printf("\n --- Full property dump of _ANEInMemoryModel ---\n"); + { + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(gI, &pcount); + printf(" (%u properties declared on class)\n", pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + const char *pattr = property_getAttributes(props[i]); + printf(" @property %s [%s]\n", pname, pattr ? pattr : "?"); + @try { + id val = [k1.model valueForKey:[NSString stringWithUTF8String:pname]]; + const char *desc = val ? [[val description] UTF8String] : "nil"; + size_t len = strlen(desc); + if (len > 200) { + printf(" value = %.200s... 
(truncated)\n", desc); + } else { + printf(" value = %s\n", desc); + } + } @catch (NSException *ex) { + printf(" value = \n", [[ex reason] UTF8String]); + } + } + free(props); + } + + printf("\n --- Walk superclasses for inherited properties ---\n"); + { + Class c = gI; + while (c) { + const char *cname = class_getName(c); + if (strstr(cname, "ANE")) { + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(c, &pcount); + if (pcount > 0) { + printf(" %s (%u props):\n", cname, pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + printf(" @property %s\n", pname); + } + } + free(props); + } + c = class_getSuperclass(c); + } + } + + printf("\n --- _ANEIOSurfaceObject introspection ---\n"); + { + id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + printf(" wI description: %s\n", [[wI description] UTF8String]); + + unsigned int pcount; + objc_property_t *props = class_copyPropertyList(gAIO, &pcount); + for (unsigned int i = 0; i < pcount; i++) { + const char *pname = property_getName(props[i]); + @try { + id val = [wI valueForKey:[NSString stringWithUTF8String:pname]]; + printf(" %s = %s\n", pname, val ? [[val description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" %s = \n", pname, [[ex reason] UTF8String]); + } + } + free(props); + + @try { + id symIdx = [wI valueForKey:@"symbolIndex"]; + printf(" symbolIndex (KVC): %s\n", symIdx ? [[symIdx description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" symbolIndex (KVC): \n", [[ex reason] UTF8String]); + } + } + + R.phase2_done = true; + printf("\n Phase 2 complete: inputSymbols=%s outputSymbols=%s programHandle=%s\n", + R.has_input_symbols ? "YES" : "NO", + R.has_output_symbols ? "YES" : "NO", + R.has_program_handle ? 
"YES" : "NO"); + + // Create IOSurface wrapper objects (shared across Phases 3-5) + id wI1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioIn); + id wO1 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k1.ioOut); + id wI2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioIn); + id wO2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), k2.ioOut); + + id req1 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, nil, @0); + id req2 = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI2], @[@0], @[wO2], @[@0], nil, nil, @0); + + NSError *e = nil; + + // ===================================================================== + // PHASE 3: IOSurface Mapper & _ANEBuffer + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 3: IOSurface Mapper & Buffer Experiments\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + uint64_t progHandle = 0; + @try { + id ph = [k1.model valueForKey:@"programHandle"]; + if (ph) progHandle = [ph unsignedLongLongValue]; + } @catch (NSException *ex) { (void)ex; } + printf(" k1 programHandle = %llu\n", progHandle); + + // 3a: Try _ANEProgramIOSurfacesMapper + // API: +mapperWithController:(id)ctrl | +mapperWithProgramHandle:(uint64_t)handle + Class gMapper = NSClassFromString(@"_ANEProgramIOSurfacesMapper"); + if (gMapper) { + printf("\n --- 3a: _ANEProgramIOSurfacesMapper ---\n"); + + // Try mapperWithProgramHandle: (takes uint64_t) + id mapper = nil; + if (progHandle) { + @try { 
+ mapper = ((id(*)(Class,SEL,uint64_t))objc_msgSend)(gMapper, + @selector(mapperWithProgramHandle:), progHandle); + printf(" mapperWithProgramHandle(%llu): %s\n", progHandle, + mapper ? [[mapper description] UTF8String] : "nil"); + if (mapper) { + R.mapper_works = true; + dump_all_properties(mapper, gMapper); + } + } @catch (NSException *ex) { + printf(" mapperWithProgramHandle EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // Try mapperWithController: using model's sharedConnection or DeviceController + if (!mapper) { + @try { + id devCtrl = nil; + Class gDevCtrl = NSClassFromString(@"_ANEDeviceController"); + if (gDevCtrl) { + devCtrl = ((id(*)(Class,SEL,uint64_t))objc_msgSend)(gDevCtrl, + @selector(controllerWithProgramHandle:), progHandle); + printf(" _ANEDeviceController.controllerWithProgramHandle: %s\n", + devCtrl ? [[devCtrl description] UTF8String] : "nil"); + } + if (devCtrl) { + mapper = ((id(*)(Class,SEL,id))objc_msgSend)(gMapper, + @selector(mapperWithController:), devCtrl); + printf(" mapperWithController: %s\n", + mapper ? [[mapper description] UTF8String] : "nil"); + if (mapper) { + R.mapper_works = true; + dump_all_properties(mapper, gMapper); + } + } + } @catch (NSException *ex) { + printf(" mapperWithController EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // Try mapIOSurfacesWithModel:request:cacheInference:error: if we have a mapper + if (mapper) { + printf("\n Trying mapIOSurfacesWithModel...\n"); + id reqMap = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, nil, @0); + @try { + NSError *mapErr = nil; + BOOL mapOk = ((BOOL(*)(id,SEL,id,id,BOOL,NSError**))objc_msgSend)( + mapper, @selector(mapIOSurfacesWithModel:request:cacheInference:error:), + k1.model, reqMap, NO, &mapErr); + printf(" mapIOSurfacesWithModel: %s\n", mapOk ? 
"YES" : "NO"); + if (!mapOk && mapErr) printf(" error: %s\n", [[mapErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" mapIOSurfacesWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Also try validateRequest:model: + @try { + BOOL validReq = ((BOOL(*)(id,SEL,id,id))objc_msgSend)( + mapper, @selector(validateRequest:model:), reqMap, k1.model); + printf(" validateRequest:model: %s\n", validReq ? "YES" : "NO"); + } @catch (NSException *ex) { + printf(" validateRequest:model: EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } else { + printf("\n _ANEProgramIOSurfacesMapper: NOT FOUND\n"); + } + + // 3b: Try _ANEBuffer + // API: +bufferWithIOSurfaceObject:(id)ioSurfObj symbolIndex:(id)symIdx source:(long long)src + Class gBuffer = NSClassFromString(@"_ANEBuffer"); + if (gBuffer) { + printf("\n --- 3b: _ANEBuffer ---\n"); + + id wBufTest = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + + for (long long src = 0; src <= 2; src++) { + @try { + id buf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuffer, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wBufTest, @0, src); + printf(" bufferWithIOSurfaceObject(symIdx=0, source=%lld): %s\n", + src, buf ? [[buf description] UTF8String] : "nil"); + if (buf) { + R.buffer_works = true; + dump_all_properties(buf, gBuffer); + @try { + id symIdx = [buf valueForKey:@"symbolIndex"]; + printf(" symbolIndex = %s\n", symIdx ? 
[[symIdx description] UTF8String] : "nil"); + if (symIdx) R.got_symbol_index = true; + } @catch (NSException *ex) { + printf(" symbolIndex: \n"); + } + } + } @catch (NSException *ex) { + printf(" bufferWithIOSurfaceObject(source=%lld) EXCEPTION: %s\n", + src, [[ex reason] UTF8String]); + } + } + + @try { + id buf1 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuffer, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wBufTest, @1, (long long)0); + printf(" bufferWithIOSurfaceObject(symIdx=1, source=0): %s\n", + buf1 ? [[buf1 description] UTF8String] : "nil"); + if (buf1) { + R.buffer_works = true; + dump_all_properties(buf1, gBuffer); + } + } @catch (NSException *ex) { + printf(" bufferWithIOSurfaceObject(symIdx=1) EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf("\n _ANEBuffer: NOT FOUND\n"); + } + + // 3c: Try _ANEIOSurfaceObject with symbolIndex setter + printf("\n --- 3c: _ANEIOSurfaceObject symbolIndex experiment ---\n"); + { + id wTest = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + + if ([wTest respondsToSelector:@selector(setSymbolIndex:)]) { + printf(" setSymbolIndex: is available!\n"); + @try { + ((void(*)(id,SEL,NSUInteger))objc_msgSend)(wTest, @selector(setSymbolIndex:), 0); + printf(" setSymbolIndex:0 succeeded\n"); + R.got_symbol_index = true; + } @catch (NSException *ex) { + printf(" setSymbolIndex:0 EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" setSymbolIndex: NOT available on _ANEIOSurfaceObject\n"); + } + + if ([wTest respondsToSelector:NSSelectorFromString(@"symbolIndex")]) { + printf(" symbolIndex getter: available\n"); + @try { + NSUInteger idx = ((NSUInteger(*)(id,SEL))objc_msgSend)(wTest, + NSSelectorFromString(@"symbolIndex")); + printf(" symbolIndex = %lu\n", (unsigned long)idx); + } @catch (NSException *ex) { + printf(" symbolIndex getter EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" symbolIndex 
getter: NOT available\n"); + } + + SEL selObjWithSurfaceIdx = NSSelectorFromString(@"objectWithIOSurface:symbolIndex:"); + if ([gAIO respondsToSelector:selObjWithSurfaceIdx]) { + printf(" +objectWithIOSurface:symbolIndex: is available!\n"); + @try { + id wIndexed = ((id(*)(Class,SEL,IOSurfaceRef,NSUInteger))objc_msgSend)( + gAIO, selObjWithSurfaceIdx, k1.ioIn, (NSUInteger)0); + printf(" result: %s\n", wIndexed ? [[wIndexed description] UTF8String] : "nil"); + if (wIndexed) R.got_symbol_index = true; + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" +objectWithIOSurface:symbolIndex: NOT available\n"); + } + } + + // 3d: Try setting symbolIndex on IOSurface itself (as IOSurface property) + printf("\n --- 3d: IOSurface property experiments ---\n"); + { + IOSurfaceLock(k1.ioIn, 0, NULL); + IOSurfaceSetValue(k1.ioIn, CFSTR("symbolIndex"), (__bridge CFTypeRef)@0); + IOSurfaceUnlock(k1.ioIn, 0, NULL); + + CFTypeRef val = IOSurfaceCopyValue(k1.ioIn, CFSTR("symbolIndex")); + printf(" IOSurface 'symbolIndex' property: %s\n", + val ? [(__bridge id)val description].UTF8String : "nil"); + if (val) CFRelease(val); + + id wWithProp = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, + @selector(objectWithIOSurface:), k1.ioIn); + @try { + id symIdx = [wWithProp valueForKey:@"symbolIndex"]; + printf(" _ANEIOSurfaceObject.symbolIndex after IOSurface property set: %s\n", + symIdx ? 
[[symIdx description] UTF8String] : "nil"); + if (symIdx) R.got_symbol_index = true; + } @catch (NSException *ex) { + printf(" _ANEIOSurfaceObject.symbolIndex: \n", [[ex reason] UTF8String]); + } + } + + // 3e: Try _ANEProgramForEvaluation + // API: +programWithHandle:(uint64_t)handle intermediateBufferHandle:(uint64_t)ibh queueDepth:(char)qd + // +programWithController:(id)ctrl intermediateBufferHandle:(uint64_t)ibh queueDepth:(char)qd + Class gProgEval = NSClassFromString(@"_ANEProgramForEvaluation"); + if (gProgEval) { + printf("\n --- 3e: _ANEProgramForEvaluation ---\n"); + + // The model already has a .program property -- read it directly + @try { + id existingProg = [k1.model valueForKey:@"program"]; + printf(" k1.model.program: %s\n", + existingProg ? [[existingProg description] UTF8String] : "nil"); + if (existingProg) { + dump_all_properties(existingProg, gProgEval); + } + } @catch (NSException *ex) { + printf(" k1.model.program: \n", [[ex reason] UTF8String]); + } + + // Try programWithHandle:intermediateBufferHandle:queueDepth: + uint64_t ibHandle = 0; + @try { + id ibh = [k1.model valueForKey:@"intermediateBufferHandle"]; + if (ibh) ibHandle = [ibh unsignedLongLongValue]; + } @catch (NSException *ex) { (void)ex; } + + @try { + id prog = ((id(*)(Class,SEL,uint64_t,uint64_t,char))objc_msgSend)(gProgEval, + @selector(programWithHandle:intermediateBufferHandle:queueDepth:), + progHandle, ibHandle, (char)1); + printf(" programWithHandle(%llu, %llu, 1): %s\n", + progHandle, ibHandle, + prog ? 
[[prog description] UTF8String] : "nil"); + if (prog) dump_all_properties(prog, gProgEval); + } @catch (NSException *ex) { + printf(" programWithHandle EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // 3f: Try _ANEIOSurfaceOutputSets and _ANEInputBuffersReady + const char *chain_helper_classes[] = { + "_ANEIOSurfaceOutputSets", + "_ANEInputBuffersReady", + "_ANEOutputSetEnqueue", + NULL + }; + for (int ci = 0; chain_helper_classes[ci]; ci++) { + Class cls = NSClassFromString([NSString stringWithUTF8String:chain_helper_classes[ci]]); + if (!cls) continue; + printf("\n --- 3f: %s instantiation ---\n", chain_helper_classes[ci]); + + unsigned int mc = 0; + Method *ms = class_copyMethodList(object_getClass(cls), &mc); + for (unsigned int i = 0; i < mc; i++) { + SEL s = method_getName(ms[i]); + printf(" + %s\n", sel_getName(s)); + } + free(ms); + + @try { + id obj = [[cls alloc] init]; + printf(" alloc/init: %s\n", obj ? [[obj description] UTF8String] : "nil"); + if (obj) dump_all_properties(obj, cls); + } @catch (NSException *ex) { + printf(" alloc/init EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + R.phase3_done = true; + printf("\n Phase 3 complete: mapper=%s buffer=%s symbolIndex=%s\n", + R.mapper_works ? "YES" : "NO", + R.buffer_works ? "YES" : "NO", + R.got_symbol_index ? 
"YES" : "NO"); + + // ===================================================================== + // PHASE 4: ChainingRequest with (potentially) indexed surfaces + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 4: ChainingRequest Retry\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + // 4a: Sequential baseline + printf("\n --- 4a: Sequential baseline ---\n"); + int WARMUP = 5, ITERS = 50; + for (int i = 0; i < WARMUP; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + IOSurfaceLock(k1.ioOut, 0, NULL); + memcpy(IOSurfaceGetBaseAddress(k2.ioIn), IOSurfaceGetBaseAddress(k1.ioOut), ioBytes); + IOSurfaceUnlock(k1.ioOut, 0, NULL); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k2.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req2, &e); + } + R.sequential_ms = tb_ms(mach_absolute_time() - t0) / ITERS; + printf(" Sequential: %.3f ms/pair (%d iters)\n", R.sequential_ms, ITERS); + + IOSurfaceLock(k2.ioOut, kIOSurfaceLockReadOnly, NULL); + if (g_fp16_io) { + _Float16 *out = (_Float16*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", + (float)out[0], (float)out[1], (float)out[2], (float)out[3]); + } else { + float *out = 
(float*)IOSurfaceGetBaseAddress(k2.ioOut); + printf(" Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out[0], out[1], out[2], out[3]); + } + IOSurfaceUnlock(k2.ioOut, kIOSurfaceLockReadOnly, NULL); + + // 4b: ChainingRequest attempts + printf("\n --- 4b: ChainingRequest attempts ---\n"); + + id client = nil; + if (gClient) { + client = [gClient performSelector:@selector(sharedConnection)]; + printf(" _ANEClient: %s\n", client ? "obtained" : "FAILED"); + } + + if (gChain && client) { + // Attempt 1: standard (same as v1) + printf("\n [Attempt 1] Standard ChainingRequest (raw IOSurface objects)\n"); + @try { + id chainReq = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], @[@[wO1]], @[@0], @[@0], @0, @[], @0, @0, @0); + + if (chainReq) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainReq, @selector(validate)); + printf(" created: YES | validate: %s\n", valid ? "YES" : "NO"); + R.validate_passed = valid; + + if (valid && client) { + @try { + BOOL prep = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainReq, 21, &e); + printf(" prepareChainingWithModel: %s\n", prep ? 
"YES" : "NO"); + if (!prep && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } else { + printf(" created: NO\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Attempt 2: with IOSurface property "symbolIndex" + printf("\n [Attempt 2] IOSurface with symbolIndex property\n"); + @try { + IOSurfaceRef sIn = make_surface(ioBytes); + IOSurfaceRef sOut = make_surface(ioBytes); + + IOSurfaceLock(sIn, 0, NULL); + if (g_fp16_io) { + _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(sIn); + for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f; + } else { + float *inp = (float*)IOSurfaceGetBaseAddress(sIn); + for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f; + } + IOSurfaceSetValue(sIn, CFSTR("symbolIndex"), (__bridge CFTypeRef)@0); + IOSurfaceUnlock(sIn, 0, NULL); + + IOSurfaceLock(sOut, 0, NULL); + IOSurfaceSetValue(sOut, CFSTR("symbolIndex"), (__bridge CFTypeRef)@0); + IOSurfaceUnlock(sOut, 0, NULL); + + id wIn2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), sIn); + id wOut2 = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), sOut); + + id chainReq2 = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wIn2], @[@[wOut2]], @[@0], @[@0], @0, @[], @0, @0, @0); + + if (chainReq2) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainReq2, @selector(validate)); + printf(" created: YES | validate: %s\n", valid ? 
"YES" : "NO"); + if (valid) R.validate_passed = true; + + if (valid) { + @try { + BOOL prep = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainReq2, 21, &e); + printf(" prepareChainingWithModel: %s\n", prep ? "YES" : "NO"); + if (prep) R.chaining_executed = true; + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } + + CFRelease(sIn); CFRelease(sOut); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Attempt 3: two-model loopback with output sets + printf("\n [Attempt 3] Two-model loopback with multiple output sets\n"); + @try { + IOSurfaceRef sMid = make_surface(ioBytes); + id wMid = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(gAIO, @selector(objectWithIOSurface:), sMid); + + id chainLoop = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], + @[@[wMid], @[wO2]], + @[@0], + @[@0], + @0, @[], @0, @0, @0); + + if (chainLoop) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainLoop, @selector(validate)); + printf(" created: YES | validate: %s\n", valid ? "YES" : "NO"); + + @try { + BOOL prep = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainLoop, 21, &e); + printf(" prepareChainingWithModel: %s\n", prep ? 
"YES" : "NO"); + if (!prep && e) printf(" error: %s\n", [[e description] UTF8String]); + + if (prep) { + R.chaining_executed = true; + + uint64_t tc0 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + BOOL enq = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(enqueueSetsWithModel:outputSet:options:qos:error:), + k1.model, @[wMid], @{}, 21, &e); + (void)enq; + BOOL buf = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(buffersReadyWithModel:inputBuffers:options:qos:error:), + k1.model, @[wI1], @{}, 21, &e); + (void)buf; + } + R.chained_ms = tb_ms(mach_absolute_time() - tc0) / ITERS; + printf(" Chained: %.3f ms/pair (%d iters)\n", R.chained_ms, ITERS); + printf(" Speedup: %.2fx vs sequential\n", R.sequential_ms / R.chained_ms); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + CFRelease(sMid); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Attempt 4: force validate bypass via prepareChainingWithModel directly + printf("\n [Attempt 4] Skip validate, call prepareChainingWithModel directly\n"); + @try { + id chainDirect = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[wI1], @[@[wO1]], @[@0], @[@0], @0, @[], @0, @0, @0); + + if (chainDirect) { + BOOL prep = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainDirect, 21, &e); + printf(" prepareChainingWithModel (no validate): %s\n", prep ? 
"YES" : "NO"); + if (!prep && e) printf(" error: %s\n", [[e description] UTF8String]); + + if (prep) { + R.chaining_executed = true; + + @try { + BOOL enq = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(enqueueSetsWithModel:outputSet:options:qos:error:), + k1.model, @[wO1], @{}, 21, &e); + printf(" enqueueSets: %s\n", enq ? "YES" : "NO"); + if (!enq && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" enqueueSets EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + @try { + BOOL buf = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(buffersReadyWithModel:inputBuffers:options:qos:error:), + k1.model, @[wI1], @{}, 21, &e); + printf(" buffersReady: %s\n", buf ? "YES" : "NO"); + if (!buf && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" buffersReady EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + // Attempt 5: Use _ANEBuffer inputs + _ANEIOSurfaceOutputSets for outputSets + printf("\n [Attempt 5] ChainingRequest with _ANEBuffer + _ANEIOSurfaceOutputSets\n"); + { + Class gOutSets = NSClassFromString(@"_ANEIOSurfaceOutputSets"); + if (gBuffer && gOutSets) { + @try { + id bufIn = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuffer, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wI1, @0, (long long)0); + id bufOut = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuffer, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wO1, @0, (long long)1); + printf(" bufIn: %s\n", bufIn ? [[bufIn description] UTF8String] : "nil"); + printf(" bufOut: %s\n", bufOut ? 
[[bufOut description] UTF8String] : "nil"); + + // Create _ANEIOSurfaceOutputSets: +objectWithstatsSurRef:outputBuffer: + // statsSurRef can be NULL, outputBuffer is NSArray of _ANEBuffer + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)(gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + NULL, @[bufOut]); + printf(" outputSet: %s\n", outSet ? [[outSet description] UTF8String] : "nil"); + + if (bufIn && outSet) { + id chainBuf = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[bufIn], @[outSet], @[@0], @[@0], @0, @[], @0, @0, @0); + + if (chainBuf) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)(chainBuf, @selector(validate)); + printf(" created: YES | validate: %s\n", valid ? "YES" : "NO"); + if (valid) R.validate_passed = true; + + @try { + BOOL prep = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chainBuf, 21, &e); + printf(" prepareChainingWithModel: %s\n", prep ? 
"YES" : "NO"); + if (!prep && e) printf(" error: %s\n", [[e description] UTF8String]); + if (prep) R.chaining_executed = true; + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" ChainingRequest creation: nil\n"); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" _ANEBuffer or _ANEIOSurfaceOutputSets not available\n"); + } + } + + // Attempt 6: Use _ANEClient.evaluateWithModel (5-param variant) + printf("\n [Attempt 6] _ANEClient.evaluateWithModel:options:request:qos:error:\n"); + @try { + BOOL clientEval = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(evaluateWithModel:options:request:qos:error:), + k1.model, @{}, req1, 21, &e); + printf(" evaluateWithModel (via client): %s\n", clientEval ? "YES" : "NO"); + if (!clientEval && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Attempt 7: Use _ANEClient.doEvaluateDirectWithModel + printf("\n [Attempt 7] _ANEClient.doEvaluateDirectWithModel\n"); + @try { + BOOL directEval = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(doEvaluateDirectWithModel:options:request:qos:error:), + k1.model, @{}, req1, 21, &e); + printf(" doEvaluateDirectWithModel: %s\n", directEval ? "YES" : "NO"); + if (!directEval && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" Skipped: gChain=%s client=%s\n", + gChain ? "YES" : "NO", client ? 
"YES" : "NO"); + } + + R.phase4_done = true; + + // ===================================================================== + // PHASE 5: Alternative Execution Paths + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 5: Alternative Execution Paths\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + + // 5a: Real-time eval + printf("\n --- 5a: Real-time eval path ---\n"); + if (client) { + @try { + printf(" Calling beginRealTimeTask...\n"); + BOOL rtBegin = ((BOOL(*)(id,SEL))objc_msgSend)(client, @selector(beginRealTimeTask)); + printf(" beginRealTimeTask: %s\n", rtBegin ? "YES" : "NO"); + + printf(" Calling evaluateRealTimeWithModel...\n"); + BOOL rtOk = ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k1.model, @{}, req1, &e); + printf(" evaluateRealTimeWithModel: %s\n", rtOk ? 
"YES" : "NO"); + if (!rtOk && e) printf(" error: %s\n", [[e description] UTF8String]); + R.realtime_eval_works = rtOk; + + if (rtOk) { + double rt_times[ITERS]; + for (int i = 0; i < WARMUP; i++) { + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k1.model, @{}, req1, &e); + } + for (int i = 0; i < ITERS; i++) { + uint64_t ti = mach_absolute_time(); + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k1.model, @{}, req1, &e); + rt_times[i] = tb_ms(mach_absolute_time() - ti); + } + double rt_sum = 0; + for (int i = 0; i < ITERS; i++) rt_sum += rt_times[i]; + printf(" RT eval: %.3f ms/eval avg (%d iters)\n", rt_sum/ITERS, ITERS); + + double std_times[ITERS]; + for (int i = 0; i < ITERS; i++) { + uint64_t ti = mach_absolute_time(); + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + std_times[i] = tb_ms(mach_absolute_time() - ti); + } + double std_sum = 0; + for (int i = 0; i < ITERS; i++) std_sum += std_times[i]; + printf(" Standard eval: %.3f ms/eval avg (%d iters)\n", std_sum/ITERS, ITERS); + printf(" RT vs Standard speedup: %.2fx\n", (std_sum/ITERS) / (rt_sum/ITERS)); + } + + BOOL rtEnd = ((BOOL(*)(id,SEL))objc_msgSend)(client, @selector(endRealTimeTask)); + printf(" endRealTimeTask: %s\n", rtEnd ? 
"YES" : "NO"); + } @catch (NSException *ex) { + printf(" Real-time eval EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + // 5b: PerfStats with perfStatsMask + printf("\n --- 5b: PerfStats with perfStatsMask ---\n"); + { + Class perfClass = NSClassFromString(@"_ANEPerformanceStats"); + if (perfClass) { + @try { + printf(" Setting perfStatsMask on model...\n"); + for (unsigned int mask = 1; mask <= 0xFF; mask <<= 1) { + @try { + [k1.model setValue:@(mask) forKey:@"perfStatsMask"]; + printf(" perfStatsMask = 0x%02X: set OK\n", mask); + } @catch (NSException *ex) { + printf(" perfStatsMask = 0x%02X: \n", mask, [[ex reason] UTF8String]); + break; + } + } + } @catch (NSException *ex) { + printf(" perfStatsMask setter: \n", [[ex reason] UTF8String]); + } + + id perfStats = nil; + @try { + perfStats = ((id(*)(Class,SEL,uint64_t))objc_msgSend)(perfClass, + @selector(statsWithHardwareExecutionNS:), (uint64_t)0); + printf(" statsWithHardwareExecutionNS:0 = %s\n", + perfStats ? [[perfStats description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" statsWithHardwareExecutionNS: EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (!perfStats) { + @try { + perfStats = [[perfClass alloc] init]; + printf(" alloc/init fallback = %s\n", + perfStats ? 
[[perfStats description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" alloc/init EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + if (perfStats) { + // perfStats param expects NSArray (request calls [perfStats count]) + // Try wrapping in array, and also try with nil + perfStatsMask + printf(" Test A: perfStats as NSArray wrapper\n"); + @try { + id reqPerfA = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, @[perfStats], @0); + if (reqPerfA) { + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, reqPerfA, &e); + printf(" Eval with @[perfStats]: %s\n", ok ? "OK" : "FAIL"); + if (ok) { + printf(" PerfStats after eval:\n"); + dump_all_properties(perfStats, perfClass); + @try { + id hwTime = [perfStats valueForKey:@"hwExecutionTime"]; + printf(" hwExecutionTime = %s\n", + hwTime ? 
[[hwTime description] UTF8String] : "nil"); + if (hwTime) { + R.hw_exec_time_ns = [hwTime unsignedLongLongValue]; + R.perfstats_works = (R.hw_exec_time_ns > 0); + } + } @catch (NSException *ex) { + printf(" hwExecutionTime: \n"); + } + } + } + } @catch (NSException *ex) { + printf(" Test A EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Test B: perfStatsMask set, but perfStats=nil in request + printf(" Test B: perfStatsMask=0xFF, perfStats=nil\n"); + @try { + [k1.model setValue:@(0xFF) forKey:@"perfStatsMask"]; + id reqPerfB = ((id(*)(Class,SEL,id,id,id,id,id,id,id))objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[wI1], @[@0], @[wO1], @[@0], nil, nil, @0); + BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, reqPerfB, &e); + printf(" Eval with mask=0xFF, perfStats=nil: %s\n", ok ? "OK" : "FAIL"); + uint64_t psMask = [[k1.model valueForKey:@"perfStatsMask"] unsignedIntValue]; + printf(" perfStatsMask after eval: 0x%llX\n", (unsigned long long)psMask); + } @catch (NSException *ex) { + printf(" Test B EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + @try { + id counters = [perfStats performSelector:@selector(performanceCounters)]; + printf(" performanceCounters: %s\n", + counters ? 
[[counters description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" performanceCounters: \n", [[ex reason] UTF8String]); + } + } + } else { + printf(" _ANEPerformanceStats: NOT FOUND\n"); + } + } + + R.phase5_done = true; + + // ===================================================================== + // PHASE 7 (Exp A): _ANEIOSurfaceOutputSets with non-NULL statsSurRef + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 7: OutputSets with stats IOSurface\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + { + Class gOutSets = NSClassFromString(@"_ANEIOSurfaceOutputSets"); + Class gBuf = NSClassFromString(@"_ANEBuffer"); + if (gOutSets && gBuf) { + id bufOut7 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wO1, @0, (long long)1); + + size_t stats_sizes[] = {64, 256, 1024, 4096, 16384}; + for (int si = 0; si < 5; si++) { + IOSurfaceRef statsSurf = make_surface(stats_sizes[si]); + printf("\n statsSurRef size=%zu bytes:\n", stats_sizes[si]); + + @try { + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)(gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[bufOut7]); + printf(" objectWithstatsSurRef: %s\n", + outSet ? 
[[outSet description] UTF8String] : "nil"); + + if (outSet) { + R.outputsets_with_stats_works = true; + dump_all_properties(outSet, gOutSets); + + printf("\n Attempting ChainingRequest with valid outputSet...\n"); + id bufIn7 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wI1, @0, (long long)0); + + @try { + id chain7 = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[bufIn7], @[outSet], @[@0], @[@0], @0, @[], @0, @0, @0); + + if (chain7) { + BOOL valid7 = ((BOOL(*)(id,SEL))objc_msgSend)(chain7, @selector(validate)); + printf(" ChainingRequest created | validate: %s\n", valid7 ? "YES" : "NO"); + if (valid7) R.chaining_with_stats_works = true; + + @try { + BOOL prep7 = ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(prepareChainingWithModel:options:chainingReq:qos:error:), + k1.model, @{}, chain7, 21, &e); + printf(" prepareChainingWithModel: %s\n", prep7 ? 
"YES" : "NO"); + if (!prep7 && e) printf(" error: %s\n", [[e description] UTF8String]); + if (prep7) R.chaining_with_stats_works = true; + } @catch (NSException *ex) { + printf(" prepareChainingWithModel EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } else { + printf(" ChainingRequest creation: nil\n"); + } + } @catch (NSException *ex) { + printf(" ChainingRequest EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + CFRelease(statsSurf); + if (R.outputsets_with_stats_works) break; + } + } else { + printf(" Required classes not found\n"); + } + } + R.phase7_done = true; + + // ===================================================================== + // PHASE 8 (Exp B): Disk-based _ANEModel for symbol discovery + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 8: Disk-based _ANEModel path\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + { + Class gANEModel = NSClassFromString(@"_ANEModel"); + if (gANEModel) { + printf("\n _ANEModel class found. Dumping API surface...\n"); + dump_class("_ANEModel"); + + NSString *compiledPath = nil; + @try { + compiledPath = [k1.model valueForKey:@"compiledModelPath"]; + printf(" k1.compiledModelPath: %s\n", + compiledPath ? [compiledPath UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" compiledModelPath: \n"); + } + if (!compiledPath) { + @try { + id url = [k1.model valueForKey:@"compiledModelURL"]; + if (url) compiledPath = [url path]; + printf(" k1.compiledModelURL.path: %s\n", + compiledPath ? 
[compiledPath UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" compiledModelURL: \n"); + } + } + + NSString *modelDir = k1.tmpDir; + printf(" k1.tmpDir: %s\n", [modelDir UTF8String]); + + NSFileManager *fm8 = [NSFileManager defaultManager]; + NSArray *contents = [fm8 contentsOfDirectoryAtPath:modelDir error:nil]; + printf(" tmpDir contents: %s\n", contents ? [[contents description] UTF8String] : "empty"); + + SEL factorySelectors[] = { + @selector(modelAtURL:), + @selector(modelWithPath:), + NSSelectorFromString(@"modelAtPath:"), + NSSelectorFromString(@"modelWithURL:"), + }; + const char *factoryNames[] = {"modelAtURL:", "modelWithPath:", "modelAtPath:", "modelWithURL:"}; + + for (int fi = 0; fi < 4; fi++) { + if ([gANEModel respondsToSelector:factorySelectors[fi]]) { + printf(" +%s: available\n", factoryNames[fi]); + } else { + printf(" +%s: NOT available\n", factoryNames[fi]); + } + } + + id diskModel = nil; + @try { + if ([gANEModel respondsToSelector:@selector(modelAtURL:)]) { + NSURL *dirURL = [NSURL fileURLWithPath:modelDir]; + diskModel = ((id(*)(Class,SEL,id))objc_msgSend)(gANEModel, + @selector(modelAtURL:), dirURL); + printf(" modelAtURL: %s\n", diskModel ? [[diskModel description] UTF8String] : "nil"); + } + } @catch (NSException *ex) { + printf(" modelAtURL EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (!diskModel) { + @try { + SEL s = NSSelectorFromString(@"modelAtPath:"); + if ([gANEModel respondsToSelector:s]) { + diskModel = ((id(*)(Class,SEL,id))objc_msgSend)(gANEModel, s, modelDir); + printf(" modelAtPath: %s\n", diskModel ? [[diskModel description] UTF8String] : "nil"); + } + } @catch (NSException *ex) { + printf(" modelAtPath EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + if (diskModel) { + R.disk_model_loads = true; + printf("\n _ANEModel loaded! 
Querying symbol names...\n"); + dump_all_properties(diskModel, gANEModel); + + const char *symbol_keys[] = { + "inputSymbolNames", "outputSymbolNames", + "inputSymbolIndicesForProcedureIndex:", + "outputSymbolIndicesForProcedureIndex:", + NULL + }; + for (int ki = 0; symbol_keys[ki]; ki++) { + @try { + if (strchr(symbol_keys[ki], ':')) { + SEL s = NSSelectorFromString( + [NSString stringWithUTF8String:symbol_keys[ki]]); + if ([diskModel respondsToSelector:s]) { + id result = ((id(*)(id,SEL,NSUInteger))objc_msgSend)( + diskModel, s, (NSUInteger)0); + printf(" %s(0) = %s\n", symbol_keys[ki], + result ? [[result description] UTF8String] : "nil"); + R.disk_model_has_symbols = (result != nil); + } else { + printf(" %s: NOT available\n", symbol_keys[ki]); + } + } else { + id val = [diskModel valueForKey: + [NSString stringWithUTF8String:symbol_keys[ki]]]; + printf(" %s = %s\n", symbol_keys[ki], + val ? [[val description] UTF8String] : "nil"); + if (val) R.disk_model_has_symbols = true; + } + } @catch (NSException *ex) { + printf(" %s: \n", symbol_keys[ki], [[ex reason] UTF8String]); + } + } + } else { + printf(" _ANEModel could not be loaded from tmpDir\n"); + } + } else { + printf(" _ANEModel: NOT FOUND\n"); + } + } + R.phase8_done = true; + + // ===================================================================== + // PHASE 9 (Exp C): processRequest via _ANEProgramForEvaluation + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 9: processRequest via ProgramForEvaluation\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + { + @try { + id prog = [k1.model valueForKey:@"program"]; + if (prog) { + printf(" k1.model.program: %s\n", [[prog description] UTF8String]); + + id hexId = [k1.model valueForKey:@"hexStringIdentifier"]; + printf(" hexStringIdentifier: %s\n", hexId ? 
[[hexId description] UTF8String] : "nil"); + + SEL procSel = @selector(processRequest:model:qos:qIndex:modelStringID:options:returnValue:error:); + if ([prog respondsToSelector:procSel]) { + printf(" processRequest selector: available\n"); + + for (int warmup = 0; warmup < WARMUP; warmup++) { + @try { + BOOL rv = NO; + ((BOOL(*)(id,SEL,id,id,unsigned int,int,id,id,BOOL*,NSError**))objc_msgSend)( + prog, procSel, req1, k1.model, 21, 0, hexId, @{}, &rv, &e); + } @catch (NSException *ex) { (void)ex; } + } + + BOOL firstOk = NO; + @try { + BOOL rv = NO; + firstOk = ((BOOL(*)(id,SEL,id,id,unsigned int,int,id,id,BOOL*,NSError**))objc_msgSend)( + prog, procSel, req1, k1.model, 21, 0, hexId, @{}, &rv, &e); + printf(" processRequest single call: %s (rv=%s)\n", + firstOk ? "YES" : "NO", rv ? "YES" : "NO"); + if (!firstOk && e) printf(" error: %s\n", [[e description] UTF8String]); + } @catch (NSException *ex) { + printf(" processRequest EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (firstOk) { + R.process_request_works = true; + + uint64_t t9 = mach_absolute_time(); + for (int i = 0; i < ITERS; i++) { + BOOL rv = NO; + ((BOOL(*)(id,SEL,id,id,unsigned int,int,id,id,BOOL*,NSError**))objc_msgSend)( + prog, procSel, req1, k1.model, 21, 0, hexId, @{}, &rv, &e); + } + R.process_request_ms = tb_ms(mach_absolute_time() - t9) / ITERS; + printf(" processRequest: %.3f ms/eval (%d iters)\n", + R.process_request_ms, ITERS); + printf(" vs RT eval: %.2fx\n", + R.process_request_ms / (R.rt_eval_ms > 0 ? 
R.rt_eval_ms : 0.090)); + } + } else { + printf(" processRequest selector: NOT available\n"); + } + } else { + printf(" k1.model.program: nil\n"); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + R.phase9_done = true; + + // ===================================================================== + // PHASE 10 (Exp D): Shared Events for hardware synchronization + // ===================================================================== + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" PHASE 10: Shared Events (hardware sync)\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + { + const char *event_classes[] = { + "_ANESharedEvents", + "_ANESharedSignalEvent", + "_ANESharedWaitEvent", + NULL + }; + + for (int ei = 0; event_classes[ei]; ei++) { + Class cls = NSClassFromString([NSString stringWithUTF8String:event_classes[ei]]); + if (cls) { + R.shared_events_exist = true; + dump_class(event_classes[ei]); + + @try { + id obj = [[cls alloc] init]; + printf(" %s alloc/init: %s\n", event_classes[ei], + obj ? [[obj description] UTF8String] : "nil"); + if (obj) dump_all_properties(obj, cls); + } @catch (NSException *ex) { + printf(" %s alloc/init EXCEPTION: %s\n", event_classes[ei], + [[ex reason] UTF8String]); + } + } else { + printf(" %s: NOT FOUND\n", event_classes[ei]); + } + } + + if (R.shared_events_exist && gChain && client) { + printf("\n Attempting ChainingRequest with shared events...\n"); + Class sigCls = NSClassFromString(@"_ANESharedSignalEvent"); + Class waitCls = NSClassFromString(@"_ANESharedWaitEvent"); + + if (sigCls && waitCls) { + @try { + id sigEvent = [[sigCls alloc] init]; + id waitEvent = [[waitCls alloc] init]; + printf(" signalEvent: %s\n", sigEvent ? [[sigEvent description] UTF8String] : "nil"); + printf(" waitEvent: %s\n", waitEvent ? 
[[waitEvent description] UTF8String] : "nil"); + + if (sigEvent) { + Class gBuf = NSClassFromString(@"_ANEBuffer"); + id bufIn10 = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + wI1, @0, (long long)0); + + id chain10 = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id))objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets:lbInputSymbolId:lbOutputSymbolId:procedureIndex:signalEvents:transactionHandle:fwEnqueueDelay:memoryPoolId:), + @[bufIn10], @[], @[@0], @[@0], @0, @[sigEvent], @0, @0, @0); + + if (chain10) { + printf(" ChainingRequest with signalEvent: created\n"); + BOOL valid10 = ((BOOL(*)(id,SEL))objc_msgSend)(chain10, @selector(validate)); + printf(" validate: %s\n", valid10 ? "YES" : "NO"); + } else { + printf(" ChainingRequest with signalEvent: nil\n"); + } + } + } @catch (NSException *ex) { + printf(" Shared events EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + } + } + R.phase10_done = true; + + // Benchmark all eval paths side-by-side for final comparison + printf("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + printf(" EVAL PATH COMPARISON (side-by-side)\n"); + printf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"); + { + int BENCH_ITERS = 200; + + for (int w = 0; w < 10; w++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + } + + uint64_t ts = mach_absolute_time(); + for (int i = 0; i < BENCH_ITERS; i++) { + ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)( + k1.model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req1, &e); + } + R.std_eval_ms = tb_ms(mach_absolute_time() - ts) / BENCH_ITERS; + + if (client) { + for (int w = 0; w < 10; w++) { + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k1.model, @{}, req1, &e); + } + ts 
= mach_absolute_time(); + for (int i = 0; i < BENCH_ITERS; i++) { + ((BOOL(*)(id,SEL,id,id,id,NSError**))objc_msgSend)( + client, @selector(evaluateRealTimeWithModel:options:request:error:), + k1.model, @{}, req1, &e); + } + R.rt_eval_ms = tb_ms(mach_absolute_time() - ts) / BENCH_ITERS; + + for (int w = 0; w < 10; w++) { + ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(doEvaluateDirectWithModel:options:request:qos:error:), + k1.model, @{}, req1, 21, &e); + } + ts = mach_absolute_time(); + for (int i = 0; i < BENCH_ITERS; i++) { + ((BOOL(*)(id,SEL,id,id,id,unsigned int,NSError**))objc_msgSend)( + client, @selector(doEvaluateDirectWithModel:options:request:qos:error:), + k1.model, @{}, req1, 21, &e); + } + R.direct_eval_ms = tb_ms(mach_absolute_time() - ts) / BENCH_ITERS; + } + + printf(" evaluateWithQoS (standard): %.3f ms/eval\n", R.std_eval_ms); + printf(" evaluateRealTimeWithModel: %.3f ms/eval (%.2fx)\n", + R.rt_eval_ms, R.std_eval_ms / R.rt_eval_ms); + printf(" doEvaluateDirectWithModel: %.3f ms/eval (%.2fx)\n", + R.direct_eval_ms, R.std_eval_ms / R.direct_eval_ms); + if (R.process_request_works) { + printf(" processRequest: %.3f ms/eval (%.2fx)\n", + R.process_request_ms, R.std_eval_ms / R.process_request_ms); + } + } + + // ===================================================================== + // PHASE 6: Summary + // ===================================================================== + printf("\n╔══════════════════════════════════════════════════════════╗\n"); + printf("║ PHASE 6: Results Summary ║\n"); + printf("╚══════════════════════════════════════════════════════════╝\n\n"); + + printf("┌──────────────────────────────────────────────────────────┐\n"); + printf("│ Phase 1: Class Introspection │\n"); + printf("│ Classes found: %d │\n", R.classes_found); + printf("│ Classes missing: %d │\n", R.classes_missing); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 2: Symbol 
Discovery │\n"); + printf("│ inputSymbolNames: %s │\n", R.has_input_symbols ? "YES" : "NO "); + printf("│ outputSymbolNames: %s │\n", R.has_output_symbols ? "YES" : "NO "); + printf("│ programHandle: %s │\n", R.has_program_handle ? "YES" : "NO "); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 3: IOSurface Mapping │\n"); + printf("│ Mapper works: %s │\n", R.mapper_works ? "YES" : "NO "); + printf("│ Buffer works: %s │\n", R.buffer_works ? "YES" : "NO "); + printf("│ Got symbolIndex: %s │\n", R.got_symbol_index ? "YES" : "NO "); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 4: ChainingRequest │\n"); + printf("│ validate passed: %s │\n", R.validate_passed ? "YES" : "NO "); + printf("│ Chaining executed: %s │\n", R.chaining_executed ? "YES" : "NO "); + printf("│ Sequential: %.3f ms/pair │\n", R.sequential_ms); + if (R.chaining_executed) { + printf("│ Chained: %.3f ms/pair │\n", R.chained_ms); + printf("│ Speedup: %.2fx │\n", R.sequential_ms / R.chained_ms); + } + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 5: Alternative Paths │\n"); + printf("│ RT eval works: %s │\n", R.realtime_eval_works ? "YES" : "NO "); + printf("│ PerfStats works: %s │\n", R.perfstats_works ? "YES" : "NO "); + if (R.perfstats_works) { + printf("│ hwExecutionTime: %llu ns │\n", R.hw_exec_time_ns); + } + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 7: OutputSets with statsSurRef │\n"); + printf("│ OutputSets works: %s │\n", R.outputsets_with_stats_works ? "YES" : "NO "); + printf("│ Chaining works: %s │\n", R.chaining_with_stats_works ? "YES" : "NO "); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 8: Disk-based _ANEModel │\n"); + printf("│ Model loads: %s │\n", R.disk_model_loads ? "YES" : "NO "); + printf("│ Has symbols: %s │\n", R.disk_model_has_symbols ? 
"YES" : "NO "); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 9: processRequest │\n"); + printf("│ Works: %s │\n", R.process_request_works ? "YES" : "NO "); + if (R.process_request_works) { + printf("│ Latency: %.3f ms/eval │\n", R.process_request_ms); + } + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Phase 10: Shared Events │\n"); + printf("│ Classes exist: %s │\n", R.shared_events_exist ? "YES" : "NO "); + printf("├──────────────────────────────────────────────────────────┤\n"); + printf("│ Eval Path Comparison (200 iters) │\n"); + printf("│ Standard: %.3f ms/eval │\n", R.std_eval_ms); + printf("│ RT: %.3f ms/eval (%.2fx) │\n", R.rt_eval_ms, R.std_eval_ms / (R.rt_eval_ms > 0 ? R.rt_eval_ms : 1)); + printf("│ Direct: %.3f ms/eval (%.2fx) │\n", R.direct_eval_ms, R.std_eval_ms / (R.direct_eval_ms > 0 ? R.direct_eval_ms : 1)); + if (R.process_request_works) { + printf("│ ProcReq: %.3f ms/eval (%.2fx) │\n", R.process_request_ms, R.std_eval_ms / (R.process_request_ms > 0 ? 
R.process_request_ms : 1)); + } + printf("└──────────────────────────────────────────────────────────┘\n"); + + // Cleanup + NSFileManager *fm = [NSFileManager defaultManager]; + NSError *cleanupErr = nil; + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k1.model, @selector(unloadWithQoS:error:), 21, &cleanupErr); + ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k2.model, @selector(unloadWithQoS:error:), 21, &cleanupErr); + [fm removeItemAtPath:k1.tmpDir error:nil]; + [fm removeItemAtPath:k2.tmpDir error:nil]; + if (k1.ioIn) CFRelease(k1.ioIn); + if (k1.ioOut) CFRelease(k1.ioOut); + if (k2.ioIn) CFRelease(k2.ioIn); + if (k2.ioOut) CFRelease(k2.ioOut); + + printf("\n=== ChainingRequest Deep Exploration v2 complete ===\n"); + } + return 0; +} diff --git a/training/test_coreml_chaining.m b/training/test_coreml_chaining.m new file mode 100644 index 0000000..8d4e688 --- /dev/null +++ b/training/test_coreml_chaining.m @@ -0,0 +1,1003 @@ +// test_coreml_chaining.m — Experiments Q-S: CoreML-compiled model for ANE chaining +// Build: make test_coreml_chaining && ./test_coreml_chaining +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; /* timebase ratio; filled by mach_timebase_info() at the top of main */ +static double tb_ms(uint64_t t) { /* Scales a mach_absolute_time() tick delta by g_tb.numer/g_tb.denom and divides by 1e6 to yield milliseconds. g_tb must be initialised first (denom would otherwise be 0). */ return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +#pragma mark - Helpers + +static void dump_class_brief(const char *name) { /* Looks up the class called `name` and prints one summary line: either NOT FOUND, or its class-method, instance-method and property counts. */ + Class cls = NSClassFromString([NSString stringWithUTF8String:name]); + if (!cls) { printf(" %s: NOT FOUND\n", name); return; } + unsigned int mc, ic, pc; /* counts written by the class_copy*List calls below */ + Method *cm = class_copyMethodList(object_getClass(cls), &mc); /* metaclass list = class methods */ + Method *im = class_copyMethodList(cls, &ic); + objc_property_t *pp = class_copyPropertyList(cls, &pc); + printf(" %s: %u class, %u instance methods, %u props\n", name, mc, ic, pc); + free(cm); free(im); free(pp); /* the copied lists are only needed for their counts */ +} + +static void dump_props(id obj) { /* Prints "name = value" for every property returned by class_copyPropertyList on obj's class, reading each value through KVC; does nothing when obj is nil. */ + if (!obj) return; + Class cls = [obj class]; + unsigned int pc; + objc_property_t *pp = class_copyPropertyList(cls, &pc); + for (unsigned int i 
= 0; i < pc; i++) { + const char *pn = property_getName(pp[i]); + @try { /* valueForKey: can raise for private or non-KVC-compliant properties; keep dumping the rest */ + id v = [obj valueForKey:[NSString stringWithUTF8String:pn]]; + NSString *desc = v ? [v description] : @"nil"; + if ([desc length] > 200) /* cap long descriptions so one property cannot flood the log */ + desc = [[desc substringToIndex:200] stringByAppendingString:@"..."]; + printf(" %s = %s\n", pn, [desc UTF8String]); + } @catch (NSException *ex) { + printf(" %s = <exception: %s>\n", pn, [[ex reason] UTF8String]); /* format now has a conversion for the reason argument, which was previously passed but never printed */ + } + } + free(pp); +} + +static void list_dir(NSString *path) { /* Walks `path` with NSFileManager's directory enumerator and prints every existing non-directory entry as "relative/path (N bytes)". */ + NSFileManager *fm = [NSFileManager defaultManager]; + NSDirectoryEnumerator *en = [fm enumeratorAtPath:path]; + NSString *f; + while ((f = [en nextObject])) { + NSString *full = [path stringByAppendingPathComponent:f]; + BOOL isDir = NO; /* isDirectory is undefined when the path does not exist, so start from a known value */ + BOOL exists = [fm fileExistsAtPath:full isDirectory:&isDir]; + if (exists && !isDir) { /* skip directories and entries that vanished or cannot be resolved */ + NSDictionary *a = [fm attributesOfItemAtPath:full error:nil]; + printf(" %s (%llu bytes)\n", [f UTF8String], + [[a objectForKey:NSFileSize] unsignedLongLongValue]); /* a == nil yields 0 bytes */ + } + } +} + +static IOSurfaceRef make_surface(size_t bytes) { /* Creates a single-row IOSurface of `bytes` one-byte elements (width = bytes-per-row = alloc size = bytes, pixel format 0). Returns the IOSurfaceCreate result unchecked; the caller releases it with CFRelease. */ + return IOSurfaceCreate((__bridge CFDictionaryRef)@{ + (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1, + (id)kIOSurfaceBytesPerElement:@1, (id)kIOSurfaceBytesPerRow:@(bytes), + (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0}); +} + +#pragma mark - Main + +int main(int argc, const char *argv[]) { + (void)argc; (void)argv; + @autoreleasepool { + mach_timebase_info(&g_tb); + printf("==============================================================\n"); + printf(" Experiments Q-S: CoreML-Compiled Model Chaining\n"); + printf("==============================================================\n\n"); + + void *handle = dlopen( + "/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/" + "AppleNeuralEngine", RTLD_NOW); + if (!handle) { printf("FATAL: dlopen ANE framework failed\n"); return 1; } + + Class gAIO = NSClassFromString(@"_ANEIOSurfaceObject"); + Class gBuf = NSClassFromString(@"_ANEBuffer"); + Class gOutSets = 
NSClassFromString(@"_ANEIOSurfaceOutputSets"); + Class gChain = NSClassFromString(@"_ANEChainingRequest"); + Class gAR = NSClassFromString(@"_ANERequest"); + Class gANEModel = NSClassFromString(@"_ANEModel"); + id client = [NSClassFromString(@"_ANEClient") + performSelector:@selector(sharedConnection)]; + + if (!gAIO || !gBuf || !gOutSets || !gChain || !gAR || !gANEModel || !client) { + printf("FATAL: Missing ANE classes\n"); + return 1; + } + + // ================================================================= + // Experiment Q: CoreML-compile .mlpackage and extract _ANEModel + // ================================================================= + printf("==============================================================\n"); + printf(" Experiment Q: CoreML Pipeline -> _ANEModel Extraction\n"); + printf("==============================================================\n\n"); + + NSString *pkgPath = @"/tmp/ane_sram_256ch_64sp.mlpackage"; + NSFileManager *fm = [NSFileManager defaultManager]; + if (![fm fileExistsAtPath:pkgPath]) { + printf(" FATAL: %s not found.\n", [pkgPath UTF8String]); + printf(" Run: python3 scripts/gen_mlpackages.py\n"); + return 1; + } + + printf(" --- Q.1: Compile .mlpackage -> .mlmodelc ---\n"); + NSError *err = nil; + NSURL *srcURL = [NSURL fileURLWithPath:pkgPath]; + NSURL *compiledURL = [MLModel compileModelAtURL:srcURL error:&err]; + if (err || !compiledURL) { + printf(" Compile FAILED: %s\n", + err ? 
[[err description] UTF8String] : "nil URL"); + return 1; + } + printf(" Compiled to: %s\n", [[compiledURL path] UTF8String]); + fflush(stdout); + + printf("\n --- Q.2: .mlmodelc contents ---\n"); + list_dir([compiledURL path]); + + printf("\n --- Q.3: Load MLModel with ANE compute units ---\n"); + MLModelConfiguration *config = [[MLModelConfiguration alloc] init]; + config.computeUnits = MLComputeUnitsAll; + err = nil; + MLModel *mlModel = [MLModel modelWithContentsOfURL:compiledURL + configuration:config error:&err]; + if (err || !mlModel) { + printf(" Load FAILED: %s\n", + err ? [[err description] UTF8String] : "nil model"); + return 1; + } + printf(" MLModel loaded: %s\n", + [NSStringFromClass([mlModel class]) UTF8String]); + fflush(stdout); + + printf("\n --- Q.4: Extract internal ANE model ---\n"); + fflush(stdout); + id aneModel = nil; + id aneProgram = nil; + + NSArray *kvcKeys = @[@"proxy", @"engine", @"aneModel", @"model", + @"neuralNetworkEngine", @"aneEngine", @"compiledModel", + @"_aneModel", @"_model", @"_engine"]; + for (NSString *key in kvcKeys) { + @try { + id val = [mlModel valueForKey:key]; + if (val) { + printf(" mlModel.%s = %s (%s)\n", [key UTF8String], + [[val description] UTF8String], + [NSStringFromClass([val class]) UTF8String]); + if ([val isKindOfClass:gANEModel]) { + aneModel = val; + printf(" *** Found _ANEModel via '%s' ***\n", + [key UTF8String]); + } + } + } @catch (NSException *ex) { (void)ex; } + } + + if (!aneModel) { + printf("\n Trying deeper traversal...\n"); + for (NSString *key1 in @[@"proxy", @"engine"]) { + id l1 = nil; + @try { l1 = [mlModel valueForKey:key1]; } + @catch (NSException *ex) { continue; } + if (!l1) continue; + printf(" L1: %s -> %s\n", [key1 UTF8String], + [NSStringFromClass([l1 class]) UTF8String]); + + for (NSString *key2 in @[@"model", @"aneModel", @"engine", + @"neuralNetworkEngine", @"aneEngine", @"_model", + @"compiledModel", @"program", @"espressoModel", + @"aneProgram", @"backend"]) { + @try { + id 
l2 = [l1 valueForKey:key2]; + if (l2) { + printf(" L2: %s.%s -> %s (%s)\n", + [key1 UTF8String], [key2 UTF8String], + [NSStringFromClass([l2 class]) UTF8String], + [[[l2 description] substringToIndex: + MIN(100, [[l2 description] length])] + UTF8String]); + if ([l2 isKindOfClass:gANEModel]) { + aneModel = l2; + printf(" *** Found _ANEModel via %s.%s ***\n", + [key1 UTF8String], [key2 UTF8String]); + } + + for (NSString *key3 in @[@"model", @"aneModel", + @"program", @"compiledModel", @"_model"]) { + @try { + id l3 = [l2 valueForKey:key3]; + if (l3) { + printf(" L3: %s.%s.%s -> %s\n", + [key1 UTF8String], + [key2 UTF8String], + [key3 UTF8String], + [NSStringFromClass([l3 class]) + UTF8String]); + if ([l3 isKindOfClass:gANEModel]) { + aneModel = l3; + printf(" *** Found _ANEModel ***\n"); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + } + + if (!aneModel) { + printf("\n Trying _ANEClient.loadModel: with .mlmodelc ---\n"); + @try { + NSURL *espressoURL = compiledURL; + NSString *espressoKey = [[compiledURL path] lastPathComponent]; + id diskModel = ((id(*)(Class,SEL,id,id))objc_msgSend)( + gANEModel, @selector(modelAtURL:key:), + espressoURL, espressoKey); + if (diskModel) { + printf(" _ANEModel from mlmodelc: %s\n", + [[diskModel description] UTF8String]); + dump_props(diskModel); + + printf("\n Loading via _ANEClient...\n"); + SEL loadSel = NSSelectorFromString( + @"loadModel:options:qos:error:"); + NSError *loadErr = nil; + BOOL loadOk = ((BOOL(*)(id,SEL,id,id,unsigned int, + NSError**))objc_msgSend)(client, loadSel, + diskModel, @{}, (unsigned int)21, &loadErr); + printf(" loadModel: %s\n", loadOk ? "YES" : "NO"); + if (loadErr) + printf(" Error: %s\n", + [[loadErr description] UTF8String]); + + if (loadOk) { + aneModel = diskModel; + printf(" *** _ANEModel LOADED via client! 
***\n"); + } else { + SEL compileSel = NSSelectorFromString( + @"compileModel:options:qos:error:"); + NSError *compErr = nil; + BOOL compOk = ((BOOL(*)(id,SEL,id,id,unsigned int, + NSError**))objc_msgSend)(client, compileSel, + diskModel, @{}, (unsigned int)21, &compErr); + printf(" compileModel: %s\n", compOk ? "YES" : "NO"); + if (compErr) + printf(" Error: %s\n", + [[compErr description] UTF8String]); + if (compOk) { + aneModel = diskModel; + printf(" *** _ANEModel COMPILED via client! ***\n"); + } + } + + dump_props(diskModel); + } + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + printf("\n --- Q.5: Inspect extracted _ANEModel ---\n"); + if (aneModel) { + printf(" _ANEModel class: %s\n", + [NSStringFromClass([aneModel class]) UTF8String]); + dump_props(aneModel); + + printf("\n Symbol indices:\n"); + @try { + SEL inSel = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + id inSyms = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + aneModel, inSel, (unsigned int)0); + printf(" inputSymbols(0): %s\n", + inSyms ? [[inSyms description] UTF8String] : "nil"); + SEL outSel = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + id outSyms = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + aneModel, outSel, (unsigned int)0); + printf(" outputSymbols(0): %s\n", + outSyms ? [[outSyms description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" Symbol EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + @try { + SEL piSel = NSSelectorFromString( + @"procedureInfoForProcedureIndex:"); + id pInfo = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + aneModel, piSel, (unsigned int)0); + printf(" procedureInfo(0): %s\n", + pInfo ? 
[[pInfo description] UTF8String] : "nil"); + if (pInfo) { + printf(" procedureInfo class: %s\n", + [NSStringFromClass([pInfo class]) UTF8String]); + dump_props(pInfo); + } + } @catch (NSException *ex) { + printf(" procedureInfo EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + @try { + id mapper = [aneModel valueForKey:@"mapper"]; + printf(" mapper: %s\n", + mapper ? [[mapper description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" mapper EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + aneProgram = nil; + @try { + aneProgram = [aneModel valueForKey:@"program"]; + printf(" program: %s\n", + aneProgram ? [[aneProgram description] UTF8String] + : "nil"); + } @catch (NSException *ex) { + printf(" program EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + @try { + id ph = [aneModel valueForKey:@"programHandle"]; + printf(" programHandle: %s\n", + ph ? [[ph description] UTF8String] : "nil"); + } @catch (NSException *ex) { (void)ex; } + + @try { + id st = [aneModel valueForKey:@"state"]; + printf(" state: %s\n", + st ? [[st description] UTF8String] : "nil"); + } @catch (NSException *ex) { (void)ex; } + + @try { + id uuid = ((id(*)(id,SEL))objc_msgSend)( + aneModel, @selector(getUUID)); + printf(" getUUID: %s\n", + uuid ? [[uuid description] UTF8String] : "nil"); + } @catch (NSException *ex) { + printf(" getUUID EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } else { + printf(" No _ANEModel extracted. 
Trying alternative approaches...\n"); + + printf("\n --- Q.5b: Deep ivar traversal ---\n"); + fflush(stdout); + + id e5Engine = nil; + @try { + e5Engine = [mlModel valueForKey:@"_internalEngine"]; + } @catch (NSException *ex) { (void)ex; } + if (!e5Engine) { + @try { + Ivar ivar = class_getInstanceVariable( + [mlModel class], "_internalEngine"); + if (ivar) e5Engine = object_getIvar(mlModel, ivar); + } @catch (NSException *ex) { (void)ex; } + } + + id savedOpPool = nil; + if (e5Engine) { + printf(" MLE5Engine: %s\n", + [NSStringFromClass([e5Engine class]) UTF8String]); + fflush(stdout); + + Class e5Cls = [e5Engine class]; + while (e5Cls && e5Cls != [NSObject class]) { + printf("\n --- Ivars of %s ---\n", + [NSStringFromClass(e5Cls) UTF8String]); + fflush(stdout); + unsigned int ic; + Ivar *ivars = class_copyIvarList(e5Cls, &ic); + for (unsigned int i = 0; i < ic; i++) { + const char *name = ivar_getName(ivars[i]); + const char *type = ivar_getTypeEncoding(ivars[i]); + printf(" ivar: %s type: %s\n", name, + type ? 
type : "?"); + fflush(stdout); + if (type && type[0] == '@') { + @try { + id val = object_getIvar(e5Engine, ivars[i]); + if (val) { + printf(" -> %s\n", + [NSStringFromClass([val class]) + UTF8String]); + fflush(stdout); + if ([val isKindOfClass:gANEModel]) { + aneModel = val; + printf(" *** FOUND _ANEModel" + " in MLE5Engine ***\n"); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(ivars); + e5Cls = class_getSuperclass(e5Cls); + } + + if (!aneModel) { + printf("\n --- Deep traversal: _programLibrary" + " and _operationPool ---\n"); + fflush(stdout); + + id targets[] = {nil, nil}; + const char *tNames[] = {"programLibrary", "operationPool"}; + @try { + targets[0] = [e5Engine valueForKey:@"programLibrary"]; + } @catch (NSException *ex) { (void)ex; } + @try { + targets[1] = [e5Engine valueForKey:@"operationPool"]; + } @catch (NSException *ex) { (void)ex; } + + for (int ti = 0; ti < 2; ti++) { + if (!targets[ti]) continue; + printf("\n [%s] %s\n", tNames[ti], + [NSStringFromClass([targets[ti] class]) + UTF8String]); + fflush(stdout); + + Class tCls = [targets[ti] class]; + while (tCls && tCls != [NSObject class]) { + unsigned int tic; + Ivar *tivars = class_copyIvarList(tCls, &tic); + for (unsigned int j = 0; j < tic; j++) { + const char *tn = ivar_getName(tivars[j]); + const char *tt = ivar_getTypeEncoding(tivars[j]); + printf(" ivar: %s type: %s\n", tn, + tt ? 
tt : "?"); + fflush(stdout); + if (tt && tt[0] == '@') { + @try { + id tv = object_getIvar(targets[ti], + tivars[j]); + if (tv) { + NSString *cls = NSStringFromClass( + [tv class]); + printf(" -> %s\n", + [cls UTF8String]); + fflush(stdout); + if ([tv isKindOfClass:gANEModel]) { + aneModel = tv; + printf(" *** FOUND" + " _ANEModel ***\n"); + } + + if ([cls containsString:@"ANE"] + || [cls containsString:@"ane"] + || [cls containsString:@"Plan"] + || [cls containsString:@"Program"] + || [cls containsString:@"Segment"]) { + printf(" Digging into" + " %s...\n", + [cls UTF8String]); + unsigned int sc; + Ivar *sivars = class_copyIvarList( + [tv class], &sc); + for (unsigned int si = 0; + si < sc && si < 30; si++) { + const char *sn = ivar_getName(sivars[si]); + const char *st = ivar_getTypeEncoding(sivars[si]); + printf(" .%s type=%s\n", + sn, st ? st : "?"); + if (st && st[0] == '@') { + @try { + id sv = object_getIvar(tv, sivars[si]); + if (sv) { + printf(" -> %s\n", + [NSStringFromClass([sv class]) UTF8String]); + if ([sv isKindOfClass:gANEModel]) { + aneModel = sv; + printf(" *** FOUND _ANEModel ***\n"); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(sivars); + } + + if ([tv isKindOfClass:[NSDictionary class]]) { + NSDictionary *d = (NSDictionary *)tv; + printf(" dict keys: %s\n", + [[[d allKeys] description] UTF8String]); + for (id key in d) { + id dv = d[key]; + printf(" [%s] -> %s\n", + [[key description] UTF8String], + [NSStringFromClass([dv class]) UTF8String]); + fflush(stdout); + if ([dv isKindOfClass:gANEModel]) { + aneModel = dv; + printf(" *** FOUND _ANEModel ***\n"); + } + unsigned int dc; + Ivar *divars = class_copyIvarList([dv class], &dc); + for (unsigned int di = 0; di < dc && di < 20; di++) { + const char *dn = ivar_getName(divars[di]); + const char *dt = ivar_getTypeEncoding(divars[di]); + if (dt && dt[0] == '@') { + @try { + id ddv = object_getIvar(dv, divars[di]); + if (ddv && [ddv isKindOfClass:gANEModel]) { + aneModel = ddv; + 
printf(" *** FOUND _ANEModel in dict val ivar %s ***\n", dn); + } else if (ddv) { + NSString *dcls = NSStringFromClass([ddv class]); + if ([dcls containsString:@"ANE"]) + printf(" .%s -> %s\n", dn, [dcls UTF8String]); + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(divars); + } + } + + if ([tv isKindOfClass:[NSArray class]]) { + NSArray *arr = (NSArray *)tv; + printf(" array count: %lu\n", + (unsigned long)[arr count]); + for (NSUInteger ai = 0; + ai < [arr count] && ai < 5; ai++) { + id av = arr[ai]; + printf(" [%lu] -> %s\n", + (unsigned long)ai, + [NSStringFromClass([av class]) UTF8String]); + if ([av isKindOfClass:gANEModel]) { + aneModel = av; + printf(" *** FOUND _ANEModel ***\n"); + } + } + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(tivars); + tCls = class_getSuperclass(tCls); + } + } + } + @try { + savedOpPool = [e5Engine valueForKey:@"operationPool"]; + } @catch (NSException *ex) { (void)ex; } + if (!aneModel) { + printf("\n --- Traversal: _pool in operationPool ---\n"); + fflush(stdout); + @try { + id pool = [savedOpPool valueForKey:@"pool"]; + if (pool && [pool isKindOfClass:[NSSet class]]) { + NSSet *s = (NSSet *)pool; + printf(" pool count: %lu\n", + (unsigned long)[s count]); + for (id item in s) { + printf(" item: %s\n", + [NSStringFromClass([item class]) + UTF8String]); + fflush(stdout); + if ([item isKindOfClass:gANEModel]) { + aneModel = item; + printf(" *** FOUND _ANEModel ***\n"); + } + unsigned int pic; + Ivar *pivars = class_copyIvarList( + [item class], &pic); + for (unsigned int pi = 0; + pi < pic && pi < 30; pi++) { + const char *pn = ivar_getName(pivars[pi]); + const char *pt = ivar_getTypeEncoding(pivars[pi]); + printf(" .%s type=%s\n", pn, + pt ? 
pt : "?"); + if (pt && pt[0] == '@') { + @try { + id pv = object_getIvar( + item, pivars[pi]); + if (pv) { + NSString *pcls = + NSStringFromClass([pv class]); + printf(" -> %s\n", + [pcls UTF8String]); + if ([pv isKindOfClass:gANEModel]) { + aneModel = pv; + printf(" *** FOUND" + " _ANEModel ***\n"); + } + if ([pcls containsString:@"ANE"] + || [pcls containsString:@"Plan"] + || [pcls containsString:@"Program"] + || [pcls containsString:@"Stream"]) { + unsigned int sic; + Ivar *sivar = class_copyIvarList( + [pv class], &sic); + for (unsigned int si = 0; + si < sic && si < 20; si++) { + const char *sn = ivar_getName(sivar[si]); + const char *st2 = ivar_getTypeEncoding(sivar[si]); + printf(" .%s type=%s\n", + sn, st2 ? st2 : "?"); + if (st2 && st2[0] == '@') { + @try { + id sv = object_getIvar(pv, sivar[si]); + if (sv) { + printf(" -> %s\n", + [NSStringFromClass([sv class]) UTF8String]); + if ([sv isKindOfClass:gANEModel]) { + aneModel = sv; + printf(" *** FOUND _ANEModel ***\n"); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(sivar); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(pivars); + } + } + } @catch (NSException *ex) { + printf(" Pool EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + if (!aneModel) { + printf("\n --- Force prediction to trigger ANE load ---\n"); + fflush(stdout); + @try { + MLModelDescription *desc = [mlModel modelDescription]; + NSDictionary *inputs = [desc inputDescriptionsByName]; + printf(" Input features:\n"); + for (NSString *name in inputs) { + MLFeatureDescription *fd = inputs[name]; + printf(" %s: type=%ld\n", [name UTF8String], + (long)fd.type); + } + + NSString *inputName = [[inputs allKeys] firstObject]; + if (inputName) { + MLMultiArray *arr = [[MLMultiArray alloc] + initWithShape:@[@1, @256, @1, @64] + dataType:MLMultiArrayDataTypeFloat32 + error:nil]; + MLDictionaryFeatureProvider *fp = + [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inputName: arr} + error:nil]; + 
NSError *predErr = nil; + id result = [mlModel predictionFromFeatures:fp + error:&predErr]; + printf(" Prediction: %s\n", + result ? "SUCCESS" : "FAILED"); + if (predErr) + printf(" Error: %s\n", + [[predErr description] UTF8String]); + fflush(stdout); + } + + @try { + id pool2 = [savedOpPool valueForKey:@"pool"]; + if (pool2 && [pool2 isKindOfClass:[NSSet class]]) { + printf("\n Pool after prediction (count=%lu):\n", + (unsigned long)[(NSSet *)pool2 count]); + for (id item in (NSSet *)pool2) { + printf(" %s\n", + [NSStringFromClass([item class]) + UTF8String]); + fflush(stdout); + unsigned int pic; + Ivar *pivars = class_copyIvarList( + [item class], &pic); + for (unsigned int pi = 0; + pi < pic && pi < 30; pi++) { + const char *pn = ivar_getName(pivars[pi]); + const char *pt = ivar_getTypeEncoding(pivars[pi]); + if (pt && pt[0] == '@') { + @try { + id pv = object_getIvar(item, pivars[pi]); + if (pv) { + printf(" .%s -> %s\n", pn, + [NSStringFromClass([pv class]) UTF8String]); + if ([pv isKindOfClass:gANEModel]) { + aneModel = pv; + printf(" *** FOUND _ANEModel ***\n"); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + free(pivars); + } + } + } @catch (NSException *ex) { (void)ex; } + } @catch (NSException *ex) { + printf(" Prediction EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + } else { + printf(" MLE5Engine: NOT FOUND\n"); + } + } + + // ================================================================= + // Experiment R: Chaining with CoreML-loaded model + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment R: Chaining with CoreML-loaded _ANEModel\n"); + printf("==============================================================\n\n"); + + BOOL chainingSuccess = NO; + + if (!aneModel) { + printf(" SKIPPED: no _ANEModel available\n"); + } else { + int ch = 256, sp = 64; + size_t bufSize = (size_t)ch * sp * 4; + IOSurfaceRef 
ioIn = make_surface(bufSize); + IOSurfaceRef ioOut = make_surface(bufSize); + + printf(" --- R.1: Baseline eval via _ANERequest ---\n"); + @try { + id ioObjIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), ioIn); + id ioObjOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), ioOut); + id req = ((id(*)(Class,SEL,id,id,id,id,id,id,id)) + objc_msgSend)(gAR, + @selector(requestWithInputs:inputIndices:outputs: + outputIndices:weightsBuffer:perfStats:procedureIndex:), + @[ioObjIn], @[@0], @[ioObjOut], @[@0], nil, nil, @0); + + if (req) { + NSError *evalErr = nil; + BOOL evalOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(evaluateWithModel:options:request:qos:error:), + aneModel, @{}, req, (unsigned int)21, &evalErr); + printf(" Single eval: %s\n", evalOk ? "YES" : "NO"); + if (evalErr) + printf(" Error: %s\n", + [[evalErr description] UTF8String]); + + if (evalOk) { + int niters = 100; + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < niters; i++) { + ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(evaluateWithModel:options:request: + qos:error:), + aneModel, @{}, req, (unsigned int)21, nil); + } + double elapsed = tb_ms(mach_absolute_time() - t0); + printf(" Baseline: %d iters in %.3f ms (%.4f ms/eval)\n", + niters, elapsed, elapsed / niters); + } + } + } @catch (NSException *ex) { + printf(" Baseline EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- R.2: ChainingRequest with nil params ---\n"); + @try { + id ioObjIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), ioIn); + id inBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObjIn, @0, (long long)0); + + id ioObjOut = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), 
ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long))objc_msgSend)( + gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex:source:), + ioObjOut, @0, (long long)1); + + IOSurfaceRef statsSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id))objc_msgSend)( + gOutSets, @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[outBuf]); + + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[inBuf], @[outSet], nil, nil, nil, + @[], @0, @0, @0); + + if (!cr) { + printf(" ChainingRequest: nil\n"); + } else { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + cr, @selector(validate)); + printf(" validate: %s\n", valid ? "YES" : "NO"); + printf(" desc: %.200s\n", + [[cr description] UTF8String]); + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + aneModel, @{}, cr, (unsigned int)21, &prepErr); + printf(" prepareChainingWithModel: %s\n", + prepOk ? "YES" : "NO"); + if (prepErr) + printf(" Error: %s\n", + [[prepErr description] UTF8String]); + + if (prepOk) { + chainingSuccess = YES; + printf(" *** PREPARE SUCCEEDED! ***\n"); + + printf("\n --- R.3: enqueueSetsWithModel ---\n"); + @try { + SEL eqSel = NSSelectorFromString( + @"enqueueSetsWithModel:outputSet:" + "options:qos:error:"); + NSError *eqErr = nil; + BOOL eqOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, eqSel, aneModel, outSet, @{}, + (unsigned int)21, &eqErr); + printf(" enqueueSets: %s\n", + eqOk ? 
"YES" : "NO"); + if (eqErr) + printf(" Error: %s\n", + [[eqErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + printf("\n --- R.4: buffersReadyWithModel ---\n"); + @try { + SEL brSel = NSSelectorFromString( + @"buffersReadyWithModel:inputBuffers:" + "options:qos:error:"); + NSError *brErr = nil; + BOOL brOk = ((BOOL(*)(id,SEL,id,id,id, + unsigned int,NSError**))objc_msgSend)( + client, brSel, aneModel, @[inBuf], @{}, + (unsigned int)21, &brErr); + printf(" buffersReady: %s\n", + brOk ? "YES" : "NO"); + if (brErr) + printf(" Error: %s\n", + [[brErr description] UTF8String]); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } + CFRelease(statsSurf); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + if (!chainingSuccess) { + printf("\n --- R.2b: Try with symbol indices ---\n"); + @try { + SEL inSel = NSSelectorFromString( + @"inputSymbolIndicesForProcedureIndex:"); + id inSyms = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + aneModel, inSel, (unsigned int)0); + SEL outSel = NSSelectorFromString( + @"outputSymbolIndicesForProcedureIndex:"); + id outSyms = ((id(*)(id,SEL,unsigned int))objc_msgSend)( + aneModel, outSel, (unsigned int)0); + printf(" inputSymbols: %s\n", + inSyms ? [[inSyms description] UTF8String] : "nil"); + printf(" outputSymbols: %s\n", + outSyms ? 
[[outSyms description] UTF8String] : "nil"); + + NSUInteger firstIn = 0, firstOut = 0; + if (inSyms && [inSyms isKindOfClass:[NSIndexSet class]] + && [(NSIndexSet *)inSyms count] > 0) + firstIn = [(NSIndexSet *)inSyms firstIndex]; + if (outSyms && [outSyms isKindOfClass:[NSIndexSet class]] + && [(NSIndexSet *)outSyms count] > 0) + firstOut = [(NSIndexSet *)outSyms firstIndex]; + + printf(" Using symbolIndex: in=%lu out=%lu\n", + (unsigned long)firstIn, (unsigned long)firstOut); + + id ioObjIn = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), ioIn); + id inBuf = ((id(*)(Class,SEL,id,id,long long)) + objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex: + source:), + ioObjIn, @(firstIn), (long long)0); + + id ioObjOut = ((id(*)(Class,SEL,IOSurfaceRef)) + objc_msgSend)( + gAIO, @selector(objectWithIOSurface:), ioOut); + id outBuf = ((id(*)(Class,SEL,id,id,long long)) + objc_msgSend)(gBuf, + @selector(bufferWithIOSurfaceObject:symbolIndex: + source:), + ioObjOut, @(firstOut), (long long)1); + + IOSurfaceRef statsSurf = make_surface(64); + id outSet = ((id(*)(Class,SEL,IOSurfaceRef,id)) + objc_msgSend)(gOutSets, + @selector(objectWithstatsSurRef:outputBuffer:), + statsSurf, @[outBuf]); + + id cr = ((id(*)(Class,SEL,id,id,id,id,id,id,id,id,id)) + objc_msgSend)(gChain, + @selector(chainingRequestWithInputs:outputSets: + lbInputSymbolId:lbOutputSymbolId:procedureIndex: + signalEvents:transactionHandle:fwEnqueueDelay: + memoryPoolId:), + @[inBuf], @[outSet], nil, nil, nil, + @[], @0, @0, @0); + + if (cr) { + BOOL valid = ((BOOL(*)(id,SEL))objc_msgSend)( + cr, @selector(validate)); + printf(" validate: %s\n", valid ? 
"YES" : "NO"); + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,id,unsigned int, + NSError**))objc_msgSend)(client, + @selector(prepareChainingWithModel:options: + chainingReq:qos:error:), + aneModel, @{}, cr, (unsigned int)21, &prepErr); + printf(" prepare (with symbols): %s\n", + prepOk ? "YES" : "NO"); + if (prepErr) + printf(" Error: %s\n", + [[prepErr description] UTF8String]); + if (prepOk) { + chainingSuccess = YES; + printf(" *** PREPARE SUCCEEDED WITH SYMBOLS! ***\n"); + } + } + CFRelease(statsSurf); + } @catch (NSException *ex) { + printf(" EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + } + + CFRelease(ioIn); + CFRelease(ioOut); + } + + // ================================================================= + // Experiment S: Two-kernel chaining (if R succeeded) + // ================================================================= + printf("\n==============================================================\n"); + printf(" Experiment S: Two-Kernel Chaining\n"); + printf("==============================================================\n\n"); + if (!chainingSuccess) { + printf(" SKIPPED: prepareChainingWithModel not yet working\n"); + printf(" (Requires success in Experiment R first)\n"); + } else { + printf(" TODO: implement two-kernel chaining pipeline\n"); + } + + // ================================================================= + // Summary + // ================================================================= + printf("\n============================================================\n"); + printf(" RESULTS SUMMARY\n"); + printf("============================================================\n"); + printf(" Exp Q: CoreML pipeline: %s\n", + aneModel ? "MODEL EXTRACTED" : "NO MODEL"); + printf(" Exp R: Chaining: %s\n", + chainingSuccess ? "SUCCESS" : "not yet"); + printf(" Exp S: Multi-kernel: %s\n", + chainingSuccess ? 
"TODO" : "BLOCKED");
+        printf("============================================================\n");
+        printf("\nDone.\n");
+    }
+    return 0;
+}
diff --git a/training/test_e5_validate.m b/training/test_e5_validate.m
new file mode 100644
index 0000000..128090a
--- /dev/null
+++ b/training/test_e5_validate.m
@@ -0,0 +1,817 @@
+// test_e5_validate.m — Experiments W1-W5: E5 Runtime Validation & Deep API Exploration
+// Build: make test_e5_validate && ./test_e5_validate
+#import <Foundation/Foundation.h>   // NOTE(review): header names were missing from all seven imports;
+#import <CoreML/CoreML.h>           //   restored from the symbols this file uses — confirm the exact list.
+#import <objc/runtime.h>            // class_copyMethodList, class_copyIvarList, object_getIvar, ...
+#import <objc/message.h>            // objc_msgSend
+#import <dlfcn.h>                   // dlopen, dlsym
+#import <mach/mach_time.h>          // mach_absolute_time, mach_timebase_info
+#import <math.h>                    // fabsf, isnan, sinf, cosf
+// g_tb is filled by mach_timebase_info() in main; tb_ms converts mach ticks to milliseconds with it.
+static mach_timebase_info_data_t g_tb;
+static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }
+
+#pragma mark - Helpers
+// Prints cls's class methods, instance methods, properties, ivars and superclass under the heading `label`.
+static void dump_all_methods(Class cls, const char *label) {
+    if (!cls) { printf(" %s: NOT FOUND\n", label); return; }
+    printf("\n--- %s ---\n", label);
+    // Class methods are listed from object_getClass(cls); instance methods from cls itself.
+    unsigned int mc;
+    Method *cm = class_copyMethodList(object_getClass(cls), &mc);
+    if (mc > 0) {
+        printf(" Class methods (%u):\n", mc);
+        for (unsigned int i = 0; i < mc; i++) {
+            const char *sel = sel_getName(method_getName(cm[i]));
+            const char *enc = method_getTypeEncoding(cm[i]);
+            printf(" + %s [%s]\n", sel, enc ? enc : "?");
+        }
+    }
+    free(cm);
+
+    Method *im = class_copyMethodList(cls, &mc);
+    if (mc > 0) {
+        printf(" Instance methods (%u):\n", mc);
+        for (unsigned int i = 0; i < mc; i++) {
+            const char *sel = sel_getName(method_getName(im[i]));
+            const char *enc = method_getTypeEncoding(im[i]);
+            printf(" - %s [%s]\n", sel, enc ? enc : "?");
+        }
+    }
+    free(im);
+
+    unsigned int pc;
+    objc_property_t *props = class_copyPropertyList(cls, &pc);
+    if (pc > 0) {
+        printf(" Properties (%u):\n", pc);
+        for (unsigned int i = 0; i < pc; i++)
+            printf(" %s [%s]\n", property_getName(props[i]),
+                   property_getAttributes(props[i]));
+    }
+    free(props);
+
+    unsigned int ic;
+    Ivar *ivars = class_copyIvarList(cls, &ic);
+    if (ic > 0) {
+        printf(" Ivars (%u):\n", ic);
+        for (unsigned int i = 0; i < ic; i++) {
+            const char *n = ivar_getName(ivars[i]);
+            const char *t = ivar_getTypeEncoding(ivars[i]);
+            printf(" %s type=%s\n", n, t ? t : "?");
+        }
+    }
+    free(ivars);
+    // The superclass line is printed only when it is something other than NSObject.
+    Class super = class_getSuperclass(cls);
+    if (super && super != [NSObject class])
+        printf(" Superclass: %s\n", class_getName(super));
+}
+// Largest |a[i] - b[i]| over n floats; returns NaN if any difference is NaN so bad outputs cannot pass.
+static float max_abs_diff(float *a, float *b, int n) {
+    float m = 0;
+    for (int i = 0; i < n; i++) {
+        float d = fabsf(a[i] - b[i]);
+        if (d > m || isnan(d)) m = d;  // once m is NaN both tests stay false, so the NaN sticks
+    }
+    return m;
+}
+// Mean of |a[i]| over n floats; returns 0 for n <= 0 instead of dividing by zero.
+static float mean_abs(float *a, int n) {
+    float s = 0;
+    for (int i = 0; i < n; i++) s += fabsf(a[i]);
+    return n > 0 ? s / n : 0.0f;
+}
+
+#pragma mark - Main
+
+int main(int argc, const char *argv[]) {
+    (void)argc; (void)argv;
+    @autoreleasepool {
+        mach_timebase_info(&g_tb);
+        printf("================================================================\n");
+        printf(" E5 Runtime: Validation & Exhaustive API Documentation\n");
+        printf("================================================================\n\n");
+
+        dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/"
+               "AppleNeuralEngine", RTLD_NOW);
+
+        // ============================================================
+        // W2: Exhaustive API Documentation (dump first so we have it)
+        // ============================================================
+        printf("================================================================\n");
+        printf(" W2: Exhaustive E5 Runtime API Documentation\n");
+        printf("================================================================\n");
+
+        const char
*classNames[] = { + "MLE5Engine", + "MLE5ProgramLibrary", + "MLE5ProgramLibraryOnDeviceAOTCompilationImpl", + "MLE5ProgramLibraryE5BundleImpl", + "MLE5ExecutionStreamOperation", + "MLE5ExecutionStream", + "MLE5ExecutionStreamPool", + "MLE5StaticShapeExecutionStreamOperationPool", + "MLE5RangeShapeExecutionStreamOperationPool", + "MLE5EnumeratedShapeExecutionStreamOperationPool", + "MLE5ExecutionStreamOperationPoolFactory", + "MLE5InputPort", + "MLE5OutputPort", + "MLE5InputPortBinder", + "MLE5OutputPortBinder", + "MLProgramE5Container", + NULL + }; + for (int i = 0; classNames[i]; i++) { + Class cls = NSClassFromString( + [NSString stringWithUTF8String:classNames[i]]); + dump_all_methods(cls, classNames[i]); + } + + printf("\n--- e5rt_* C API Symbols ---\n"); + const char *cFuncs[] = { + "e5rt_program_library_create", + "e5rt_program_library_destroy", + "e5rt_program_library_compile", + "e5rt_program_library_get_function", + "e5rt_program_library_load_function", + "e5rt_execution_stream_create", + "e5rt_execution_stream_destroy", + "e5rt_execution_stream_submit", + "e5rt_execution_stream_wait", + "e5rt_execution_stream_execute", + "e5rt_execution_stream_sync", + "e5rt_execution_stream_operation_create", + "e5rt_execution_stream_operation_destroy", + "e5rt_execution_stream_operation_set_input", + "e5rt_execution_stream_operation_set_output", + "e5rt_execution_stream_operation_execute", + "e5rt_async_event_create", + "e5rt_async_event_destroy", + "e5rt_async_event_signal", + "e5rt_async_event_wait", + "e5rt_buffer_create", + "e5rt_buffer_destroy", + "e5rt_io_port_create", + "e5rt_io_port_bind", + "e5rt_context_create", + "e5rt_init", + "e5rt_get_version", + NULL + }; + for (int i = 0; cFuncs[i]; i++) { + void *sym = dlsym(RTLD_DEFAULT, cFuncs[i]); + if (sym) printf(" FOUND: %s at %p\n", cFuncs[i], sym); + } + fflush(stdout); + + // ============================================================ + // W1: Output Validation + // 
============================================================ + printf("\n================================================================\n"); + printf(" W1: Output Correctness Validation\n"); + printf("================================================================\n\n"); + + int ch = 256, sp = 64; + NSString *pkgPath = [NSString stringWithFormat: + @"/tmp/ane_sram_%dch_%dsp.mlpackage", ch, sp]; + if (![[NSFileManager defaultManager] fileExistsAtPath:pkgPath]) { + printf(" FATAL: %s not found. Run gen_mlpackages.py\n", + [pkgPath UTF8String]); + return 1; + } + + NSError *err = nil; + MLModelConfiguration *cfg = [[MLModelConfiguration alloc] init]; + cfg.computeUnits = MLComputeUnitsAll; + MLPredictionOptions *predOpts = [[MLPredictionOptions alloc] init]; + Class opCls = NSClassFromString(@"MLE5ExecutionStreamOperation"); + + NSURL *compiled = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:pkgPath] error:&err]; + if (err) { printf(" Compile FAILED\n"); return 1; } + err = nil; + MLModel *model = [MLModel modelWithContentsOfURL:compiled + configuration:cfg error:&err]; + if (err) { printf(" Load FAILED\n"); return 1; } + + int nElems = 1 * ch * 1 * sp; + MLMultiArray *inputArr = [[MLMultiArray alloc] + initWithShape:@[@1, @(ch), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + + float *inPtr = (float *)[inputArr dataPointer]; + for (int i = 0; i < nElems; i++) + inPtr[i] = sinf((float)i * 0.01f) * 0.5f; + + NSString *inName = [[[[model modelDescription] inputDescriptionsByName] + allKeys] firstObject]; + NSString *outName = [[[[model modelDescription] outputDescriptionsByName] + allKeys] firstObject]; + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inName: inputArr} error:nil]; + + printf(" Input: %s [1,%d,1,%d], first 5: [%.4f %.4f %.4f %.4f %.4f]\n", + [inName UTF8String], ch, sp, + inPtr[0], inPtr[1], inPtr[2], inPtr[3], inPtr[4]); + printf(" Output: %s\n", [outName UTF8String]); + 
fflush(stdout); + + // --- Reference: CoreML sequential prediction --- + printf("\n --- W1.1: CoreML reference prediction ---\n"); + err = nil; + id refResult = [model predictionFromFeatures:fp error:&err]; + if (err) { printf(" Prediction FAILED\n"); return 1; } + + MLMultiArray *refOut = [refResult featureValueForName:outName].multiArrayValue; + float *refPtr = (float *)[refOut dataPointer]; + int outElems = 1; + for (int d = 0; d < (int)refOut.shape.count; d++) + outElems *= [refOut.shape[d] intValue]; + printf(" Output shape: ["); + for (int d = 0; d < (int)refOut.shape.count; d++) + printf("%s%d", d ? "," : "", [refOut.shape[d] intValue]); + printf("] (%d elements)\n", outElems); + printf(" First 5 ref: [%.6f %.6f %.6f %.6f %.6f]\n", + refPtr[0], refPtr[1], refPtr[2], refPtr[3], refPtr[4]); + printf(" Mean |ref|: %.6f\n", mean_abs(refPtr, outElems)); + fflush(stdout); + + // --- E5 stream prediction --- + printf("\n --- W1.2: E5 stream prediction ---\n"); + + id e5engine = nil; + @try { e5engine = [model valueForKey:@"_internalEngine"]; } + @catch (NSException *e) { (void)e; } + id progLib = nil; + @try { progLib = [e5engine valueForKey:@"programLibrary"]; } + @catch (NSException *e) { (void)e; } + id streamPool = nil; + @try { streamPool = [e5engine valueForKey:@"streamPool"]; } + @catch (NSException *e) { (void)e; } + + id op = ((id(*)(id,SEL,id,id,id,id,id,unsigned long long))objc_msgSend)( + [opCls alloc], + @selector(initWithProgramLibrary:functionName:modelDescription: + configuration:debugLabel:modelSignpostId:), + progLib, @"main", [model modelDescription], cfg, + @"validate_op", (unsigned long long)0); + + NSError *plErr = nil; + BOOL plOk = ((BOOL(*)(id,SEL,NSError**))objc_msgSend)( + op, @selector(preloadAndReturnError:), &plErr); + printf(" preload: %s\n", plOk ? 
"YES" : "NO"); + if (plErr) printf(" Error: %s\n", [[plErr description] UTF8String]); + fflush(stdout); + + id stream = [streamPool performSelector:@selector(takeOut)]; + Ivar shIvar = class_getInstanceVariable([stream class], "_streamHandle"); + void *sh = (__bridge void *)object_getIvar(stream, shIvar); + printf(" stream: %p, handle: %p\n", (__bridge void *)stream, sh); + + [stream setValue:@[op] forKey:@"operations"]; + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + op, @selector(prepareForInputFeatures:options:error:), + fp, predOpts, &prepErr); + printf(" prepare: %s\n", prepOk ? "YES" : "NO"); + if (prepErr) printf(" Error: %s\n", [[prepErr description] UTF8String]); + fflush(stdout); + + NSError *execErr = nil; + BOOL execOk = ((BOOL(*)(id,SEL,void*,NSError**))objc_msgSend)( + stream, @selector(_executeStream:error:), sh, &execErr); + printf(" execute: %s\n", execOk ? "YES" : "NO"); + if (execErr) printf(" Error: %s\n", [[execErr description] UTF8String]); + fflush(stdout); + + // Read output from the operation + printf("\n --- W1.3: Read E5 output features ---\n"); + fflush(stdout); + id e5Result = nil; + @try { + e5Result = [op valueForKey:@"outputFeatures"]; + printf(" outputFeatures: %s\n", + e5Result ? 
[NSStringFromClass([e5Result class]) UTF8String] + : "nil"); + } @catch (NSException *ex) { + printf(" outputFeatures EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + if (e5Result && [e5Result conformsToProtocol:@protocol(MLFeatureProvider)]) { + MLMultiArray *e5Out = [(id)e5Result + featureValueForName:outName].multiArrayValue; + if (e5Out) { + float *e5Ptr = (float *)[e5Out dataPointer]; + printf(" E5 first 5: [%.6f %.6f %.6f %.6f %.6f]\n", + e5Ptr[0], e5Ptr[1], e5Ptr[2], e5Ptr[3], e5Ptr[4]); + printf(" Mean |e5|: %.6f\n", mean_abs(e5Ptr, outElems)); + + float mad = max_abs_diff(refPtr, e5Ptr, outElems); + printf(" Max abs diff: %.8f\n", mad); + printf(" Relative error: %.2e\n", + mad / (mean_abs(refPtr, outElems) + 1e-10f)); + + if (mad < 1e-3f) { + printf(" *** VALIDATION PASSED: outputs match ***\n"); + } else if (mad < 1e-1f) { + printf(" VALIDATION WARNING: small differences (FP16 expected)\n"); + } else { + printf(" VALIDATION FAILED: outputs diverge!\n"); + } + } else { + printf(" E5 output array is nil for key '%s'\n", + [outName UTF8String]); + + NSArray *ofNames = [(id)e5Result + featureNames].allObjects; + printf(" Available features: %s\n", + [[ofNames description] UTF8String]); + } + } else { + printf(" Cannot read output features\n"); + } + + // Also read output via outputPorts + printf("\n --- W1.4: Read via output ports ---\n"); + fflush(stdout); + @try { + id outPorts = [op valueForKey:@"outputPorts"]; + printf(" outputPorts: %s (count=%lu)\n", + outPorts ? [NSStringFromClass([outPorts class]) UTF8String] + : "nil", + outPorts ? (unsigned long)[(NSArray *)outPorts count] : 0); + + if (outPorts && [(NSArray *)outPorts count] > 0) { + for (NSUInteger pi = 0; pi < [(NSArray *)outPorts count]; pi++) { + id port = [(NSArray *)outPorts objectAtIndex:pi]; + printf(" Port[%lu]: %s\n", (unsigned long)pi, + [[port description] UTF8String]); + @try { + id portName = [port valueForKey:@"name"]; + printf(" name: %s\n", + portName ? 
[(NSString *)portName UTF8String] : "nil"); + } @catch (NSException *ex) { (void)ex; } + @try { + id portFD = [port valueForKey:@"featureDescription"]; + printf(" featureDescription: %s\n", + portFD ? [[portFD description] UTF8String] : "nil"); + } @catch (NSException *ex) { (void)ex; } + @try { + id binder = [port valueForKey:@"binder"]; + printf(" binder: %s\n", + binder ? [NSStringFromClass([binder class]) + UTF8String] : "nil"); + if (binder) { + @try { + id fv = [binder valueForKey:@"featureValue"]; + printf(" featureValue: %s\n", + fv ? [NSStringFromClass([fv class]) + UTF8String] : "nil"); + if (fv) { + MLMultiArray *ma = [(MLFeatureValue *)fv + multiArrayValue]; + if (ma) { + float *ptr = (float *)[ma dataPointer]; + printf(" first 5: [%.6f %.6f %.6f" + " %.6f %.6f]\n", + ptr[0], ptr[1], ptr[2], + ptr[3], ptr[4]); + float mad2 = max_abs_diff(refPtr, ptr, + outElems); + printf(" Max abs diff vs ref: %.8f\n", + mad2); + } + } + } @catch (NSException *ex) { + printf(" featureValue EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + } @catch (NSException *ex) { (void)ex; } + } + } + } @catch (NSException *ex) { + printf(" outputPorts EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Also read input ports + printf("\n --- W1.5: Inspect input ports ---\n"); + fflush(stdout); + @try { + id inPorts = [op valueForKey:@"inputPorts"]; + printf(" inputPorts: %s (count=%lu)\n", + inPorts ? [NSStringFromClass([inPorts class]) UTF8String] + : "nil", + inPorts ? 
(unsigned long)[(NSArray *)inPorts count] : 0); + if (inPorts) { + for (NSUInteger pi = 0; pi < [(NSArray *)inPorts count]; pi++) { + id port = [(NSArray *)inPorts objectAtIndex:pi]; + printf(" Port[%lu]: %s\n", (unsigned long)pi, + [[port description] UTF8String]); + @try { + printf(" name: %s\n", + [[(id)[port valueForKey:@"name"] description] + UTF8String]); + printf(" portHandle: %p\n", + (__bridge void *)[port valueForKey:@"portHandle"]); + } @catch (NSException *ex) { (void)ex; } + @try { + id binder = [port valueForKey:@"binder"]; + if (binder) { + printf(" binder: %s\n", + [NSStringFromClass([binder class]) UTF8String]); + printf(" bindingMode: %d\n", + ((char(*)(id,SEL))objc_msgSend)( + binder, @selector(bindingMode))); + id dfv = nil; + @try { + dfv = [binder valueForKey:@"directlyBoundFeatureValue"]; + } @catch (NSException *ex) { (void)ex; } + printf(" directlyBound: %s\n", + dfv ? "YES" : "NO"); + } + } @catch (NSException *ex) { (void)ex; } + } + } + } @catch (NSException *ex) { + printf(" inputPorts EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + // Return stream + [stream setValue:@[op] forKey:@"operations"]; + ((void(*)(id,SEL,id))objc_msgSend)( + streamPool, @selector(putBack:), stream); + + // ============================================================ + // W1.6: Multi-op output validation + // ============================================================ + printf("\n --- W1.6: Multi-op output validation ---\n"); + fflush(stdout); + + { + NSString *pkg2Path = @"/tmp/ane_sram_512ch_64sp.mlpackage"; + err = nil; + NSURL *c2 = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:pkg2Path] error:&err]; + if (err) { printf(" Compile2 FAILED\n"); goto skip_multiop; } + err = nil; + MLModel *model2 = [MLModel modelWithContentsOfURL:c2 + configuration:cfg error:&err]; + if (err) { printf(" Load2 FAILED\n"); goto skip_multiop; } + int ch2 = 512; + int nElems2 = 1 * ch2 * 1 * sp; + MLMultiArray *inputArr2 = [[MLMultiArray alloc] + initWithShape:@[@1, 
@(ch2), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + float *in2Ptr = (float *)[inputArr2 dataPointer]; + for (int i = 0; i < nElems2; i++) + in2Ptr[i] = cosf((float)i * 0.02f) * 0.3f; + + NSString *in2Name = [[[[model2 modelDescription] inputDescriptionsByName] + allKeys] firstObject]; + NSString *out2Name = [[[[model2 modelDescription] outputDescriptionsByName] + allKeys] firstObject]; + MLDictionaryFeatureProvider *fp2 = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{in2Name: inputArr2} error:nil]; + + // Reference predictions + err = nil; + id ref1 = [model predictionFromFeatures:fp error:&err]; + err = nil; + id ref2 = [model2 predictionFromFeatures:fp2 error:&err]; + float *ref1Ptr = (float *)[[ref1 featureValueForName:outName].multiArrayValue dataPointer]; + float *ref2Ptr = (float *)[[ref2 featureValueForName:out2Name].multiArrayValue dataPointer]; + + // E5 multi-op stream + id e5_2 = nil; + @try { e5_2 = [model2 valueForKey:@"_internalEngine"]; } + @catch (NSException *e) { (void)e; } + id pLib2 = nil; + @try { pLib2 = [e5_2 valueForKey:@"programLibrary"]; } + @catch (NSException *e) { (void)e; } + + id op1 = ((id(*)(id,SEL,id,id,id,id,id,unsigned long long))objc_msgSend)( + [opCls alloc], + @selector(initWithProgramLibrary:functionName:modelDescription: + configuration:debugLabel:modelSignpostId:), + progLib, @"main", [model modelDescription], cfg, + @"val_op1", (unsigned long long)0); + id op2 = ((id(*)(id,SEL,id,id,id,id,id,unsigned long long))objc_msgSend)( + [opCls alloc], + @selector(initWithProgramLibrary:functionName:modelDescription: + configuration:debugLabel:modelSignpostId:), + pLib2, @"main", [model2 modelDescription], cfg, + @"val_op2", (unsigned long long)0); + + ((BOOL(*)(id,SEL,NSError**))objc_msgSend)(op1, @selector(preloadAndReturnError:), nil); + ((BOOL(*)(id,SEL,NSError**))objc_msgSend)(op2, @selector(preloadAndReturnError:), nil); + + id stream2 = [streamPool performSelector:@selector(takeOut)]; + Ivar 
shIvar2 = class_getInstanceVariable([stream2 class], "_streamHandle"); + void *sh2 = (__bridge void *)object_getIvar(stream2, shIvar2); + + [stream2 setValue:@[op1, op2] forKey:@"operations"]; + + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + op1, @selector(prepareForInputFeatures:options:error:), + fp, predOpts, nil); + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + op2, @selector(prepareForInputFeatures:options:error:), + fp2, predOpts, nil); + + NSError *mErr = nil; + BOOL mOk = ((BOOL(*)(id,SEL,void*,NSError**))objc_msgSend)( + stream2, @selector(_executeStream:error:), sh2, &mErr); + printf(" Multi-op execute: %s\n", mOk ? "YES" : "NO"); + if (mErr) printf(" Error: %s\n", [[mErr description] UTF8String]); + fflush(stdout); + + if (mOk) { + // Read outputs + @try { + id out1 = [op1 valueForKey:@"outputFeatures"]; + id out2 = [op2 valueForKey:@"outputFeatures"]; + + if (out1 && out2) { + MLMultiArray *ma1 = [(id)out1 + featureValueForName:outName].multiArrayValue; + MLMultiArray *ma2 = [(id)out2 + featureValueForName:out2Name].multiArrayValue; + + if (ma1 && ma2) { + float *p1 = (float *)[ma1 dataPointer]; + float *p2 = (float *)[ma2 dataPointer]; + + float mad1 = max_abs_diff(ref1Ptr, p1, outElems); + float mad2 = max_abs_diff(ref2Ptr, p2, nElems2); + + printf(" Op1 max diff: %.8f (mean_ref=%.6f)\n", + mad1, mean_abs(ref1Ptr, outElems)); + printf(" Op2 max diff: %.8f (mean_ref=%.6f)\n", + mad2, mean_abs(ref2Ptr, nElems2)); + + if (mad1 < 1e-3f && mad2 < 1e-3f) { + printf(" *** MULTI-OP VALIDATION PASSED ***\n"); + } else { + printf(" MULTI-OP VALIDATION: differences detected\n"); + } + } else { + printf(" Could not extract MLMultiArray from outputs\n"); + } + } else { + printf(" outputFeatures nil for op1 or op2\n"); + } + } @catch (NSException *ex) { + printf(" Output read EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + [stream2 setValue:@[op1] forKey:@"operations"]; + ((void(*)(id,SEL,id))objc_msgSend)( + streamPool, @selector(putBack:), 
stream2); + } +skip_multiop: + + // ============================================================ + // W4: Async stream submission + // ============================================================ + printf("\n================================================================\n"); + printf(" W4: Async Stream Submission\n"); + printf("================================================================\n\n"); + fflush(stdout); + + { + id asyncStream = [streamPool performSelector:@selector(takeOut)]; + Ivar ashIvar = class_getInstanceVariable([asyncStream class], "_streamHandle"); + void *ash = (__bridge void *)object_getIvar(asyncStream, ashIvar); + + id asyncOp = ((id(*)(id,SEL,id,id,id,id,id,unsigned long long)) + objc_msgSend)([opCls alloc], + @selector(initWithProgramLibrary:functionName:modelDescription: + configuration:debugLabel:modelSignpostId:), + progLib, @"main", [model modelDescription], cfg, + @"async_op", (unsigned long long)0); + ((BOOL(*)(id,SEL,NSError**))objc_msgSend)( + asyncOp, @selector(preloadAndReturnError:), nil); + [asyncStream setValue:@[asyncOp] forKey:@"operations"]; + + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + asyncOp, @selector(prepareForInputFeatures:options:error:), + fp, predOpts, nil); + + // Try async submission + __block BOOL asyncDone = NO; + __block double asyncMs = 0; + uint64_t asyncT0 = mach_absolute_time(); + + @try { + // prepareAsyncSubmissionForInputFeatures + NSError *asyncPrepErr = nil; + BOOL asyncPrepOk = ((BOOL(*)(id,SEL,id,id,NSError**)) + objc_msgSend)(asyncStream, + @selector(prepareAsyncSubmissionForInputFeatures:options:error:), + fp, predOpts, &asyncPrepErr); + printf(" prepareAsyncSubmission: %s\n", + asyncPrepOk ? 
"YES" : "NO"); + if (asyncPrepErr) printf(" Error: %s\n", + [[asyncPrepErr description] UTF8String]); + fflush(stdout); + + if (asyncPrepOk) { + ((void(*)(id,SEL,void(^)(void)))objc_msgSend)( + asyncStream, @selector(submitWithCompletionHandler:), + ^{ + asyncMs = tb_ms(mach_absolute_time() - asyncT0); + asyncDone = YES; + }); + printf(" Submitted async, waiting...\n"); + fflush(stdout); + + for (int w = 0; w < 100 && !asyncDone; w++) + usleep(1000); + + printf(" Async completed: %s (%.3f ms)\n", + asyncDone ? "YES" : "TIMEOUT", asyncMs); + fflush(stdout); + + if (asyncDone) { + // Benchmark async vs sync + int N = 200; + + // Sync benchmark + uint64_t t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + asyncOp, + @selector(prepareForInputFeatures:options:error:), + fp, predOpts, nil); + ((BOOL(*)(id,SEL,void*,NSError**))objc_msgSend)( + asyncStream, + @selector(_executeStream:error:), ash, nil); + } + double syncMs = tb_ms(mach_absolute_time() - t0) / N; + + // Async benchmark + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) { + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + asyncOp, + @selector(prepareForInputFeatures:options:error:), + fp, predOpts, nil); + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + asyncStream, + @selector(prepareAsyncSubmissionForInputFeatures: + options:error:), + fp, predOpts, nil); + + __block BOOL done = NO; + ((void(*)(id,SEL,void(^)(void)))objc_msgSend)( + asyncStream, + @selector(submitWithCompletionHandler:), + ^{ done = YES; }); + while (!done) usleep(100); + } + double asyncBenchMs = tb_ms(mach_absolute_time() - t0) / N; + + printf(" Sync: %.4f ms/eval\n", syncMs); + printf(" Async (wait): %.4f ms/eval\n", asyncBenchMs); + } + } + } @catch (NSException *ex) { + printf(" Async EXCEPTION: %s\n", [[ex reason] UTF8String]); + } + + [asyncStream setValue:@[asyncOp] forKey:@"operations"]; + ((void(*)(id,SEL,id))objc_msgSend)( + streamPool, 
@selector(putBack:), asyncStream); + } + + // ============================================================ + // W5: Port-Based Data Flow + // ============================================================ + printf("\n================================================================\n"); + printf(" W5: Port-Based Data Flow Investigation\n"); + printf("================================================================\n\n"); + fflush(stdout); + + { + id portOp = ((id(*)(id,SEL,id,id,id,id,id,unsigned long long)) + objc_msgSend)([opCls alloc], + @selector(initWithProgramLibrary:functionName:modelDescription: + configuration:debugLabel:modelSignpostId:), + progLib, @"main", [model modelDescription], cfg, + @"port_op", (unsigned long long)0); + ((BOOL(*)(id,SEL,NSError**))objc_msgSend)( + portOp, @selector(preloadAndReturnError:), nil); + + // Inspect ports before prepare + printf(" --- Before prepare ---\n"); + @try { + id inP = [portOp valueForKey:@"inputPorts"]; + id outP = [portOp valueForKey:@"outputPorts"]; + id stP = [portOp valueForKey:@"statePorts"]; + printf(" inputPorts: %lu, outputPorts: %lu, statePorts: %lu\n", + inP ? (unsigned long)[(NSArray *)inP count] : 0, + outP ? (unsigned long)[(NSArray *)outP count] : 0, + stP ? 
(unsigned long)[(NSArray *)stP count] : 0); + + if (inP) { + for (id p in (NSArray *)inP) { + printf(" in: %s portHandle=%p name=%s\n", + [NSStringFromClass([p class]) UTF8String], + (__bridge void *)[p valueForKey:@"portHandle"], + [[(id)[p valueForKey:@"name"] description] UTF8String]); + } + } + if (outP) { + for (id p in (NSArray *)outP) { + printf(" out: %s portHandle=%p name=%s\n", + [NSStringFromClass([p class]) UTF8String], + (__bridge void *)[p valueForKey:@"portHandle"], + [[(id)[p valueForKey:@"name"] description] UTF8String]); + @try { + id fd = [p valueForKey:@"featureDescription"]; + if (fd) printf(" featureDesc: %s\n", + [[fd description] UTF8String]); + } @catch (NSException *ex) { (void)ex; } + } + } + } @catch (NSException *ex) { + printf(" Port inspection EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + + // Prepare and inspect after + ((BOOL(*)(id,SEL,id,id,NSError**))objc_msgSend)( + portOp, @selector(prepareForInputFeatures:options:error:), + fp, predOpts, nil); + + printf("\n --- After prepare ---\n"); + @try { + id inP = [portOp valueForKey:@"inputPorts"]; + if (inP) { + for (id p in (NSArray *)inP) { + id binder = [p valueForKey:@"binder"]; + BOOL directBound = ((BOOL(*)(id,SEL))objc_msgSend)( + p, @selector(boundFeatureDirectly)); + printf(" in: name=%s directBound=%s binder=%s\n", + [[(id)[p valueForKey:@"name"] description] UTF8String], + directBound ? "YES" : "NO", + binder ? 
[NSStringFromClass([binder class]) + UTF8String] : "nil"); + if (binder) { + char mode = ((char(*)(id,SEL))objc_msgSend)( + binder, @selector(bindingMode)); + printf(" bindingMode=%d\n", (int)mode); + } + } + } + id outP = [portOp valueForKey:@"outputPorts"]; + if (outP) { + for (id p in (NSArray *)outP) { + BOOL directBound = ((BOOL(*)(id,SEL))objc_msgSend)( + p, @selector(boundFeatureDirectly)); + BOOL obDirectBound = ((BOOL(*)(id,SEL))objc_msgSend)( + p, @selector(outputBackingWasDirectlyBound)); + printf(" out: name=%s directBound=%s" + " outputBackingDirectBound=%s\n", + [[(id)[p valueForKey:@"name"] description] UTF8String], + directBound ? "YES" : "NO", + obDirectBound ? "YES" : "NO"); + id binder = [p valueForKey:@"binder"]; + if (binder) { + printf(" binder: %s\n", + [NSStringFromClass([binder class]) UTF8String]); + @try { + id ob = [binder valueForKey:@"outputBacking"]; + printf(" outputBacking: %s\n", + ob ? [NSStringFromClass([ob class]) + UTF8String] : "nil"); + } @catch (NSException *ex) { (void)ex; } + } + } + } + } @catch (NSException *ex) { + printf(" Post-prepare EXCEPTION: %s\n", + [[ex reason] UTF8String]); + } + } + + // ============================================================ + // Summary + // ============================================================ + printf("\n================================================================\n"); + printf(" SUMMARY\n"); + printf("================================================================\n"); + printf(" W1: Output validation -- see above\n"); + printf(" W2: API documentation -- complete (all classes dumped)\n"); + printf(" W4: Async submission -- see above\n"); + printf(" W5: Port data flow -- see above\n"); + printf("================================================================\n"); + printf("\nDone.\n"); + } + return 0; +} diff --git a/training/test_mil_custom.m b/training/test_mil_custom.m new file mode 100644 index 0000000..b953bd5 --- /dev/null +++ b/training/test_mil_custom.m @@ -0,0 
+1,915 @@ +// test_mil_custom.m — Experiments Y1-Y3, Z1: Custom MIL -> ANE Execution +// Build: make test_mil_custom && ./test_mil_custom +#import +#import +#import +#import +#import +#import +#import + +static mach_timebase_info_data_t g_tb; +static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; } + +#pragma mark - MIL Compilation Pipeline + +static id compileAndCreateEngine(NSString *milText, NSString *label, + id container, MLModelConfiguration *cfg, + MLModelDescription *desc, NSError **outErr) { + NSString *milPath = [NSString stringWithFormat:@"/tmp/%@.mil", label]; + [milText writeToFile:milPath atomically:YES encoding:NSUTF8StringEncoding error:nil]; + NSURL *milURL = [NSURL fileURLWithPath:milPath]; + + Class aotCls = NSClassFromString(@"MLE5ProgramLibraryOnDeviceAOTCompilationImpl"); + if (!aotCls) { + if (outErr) *outErr = [NSError errorWithDomain:@"MIL" code:1 + userInfo:@{NSLocalizedDescriptionKey: @"AOT class not found"}]; + return nil; + } + + id aotImpl = ((id(*)(id,SEL,id,id,id))objc_msgSend)( + [aotCls alloc], + NSSelectorFromString(@"initWithMILTextAtURL:container:configuration:"), + milURL, container, cfg); + if (!aotImpl) { + if (outErr) *outErr = [NSError errorWithDomain:@"MIL" code:2 + userInfo:@{NSLocalizedDescriptionKey: @"AOT init failed"}]; + return nil; + } + + NSError *plErr = nil; + void *plHandle = ((void*(*)(id,SEL,BOOL,NSError**))objc_msgSend)( + aotImpl, + NSSelectorFromString(@"createProgramLibraryHandleWithRespecialization:error:"), + NO, &plErr); + if (!plHandle) { + printf(" [%s] PL handle failed: %s\n", [label UTF8String], + plErr ? 
[[plErr description] UTF8String] : "unknown"); + if (outErr) *outErr = plErr; + return nil; + } + + Class plCls = NSClassFromString(@"MLE5ProgramLibrary"); + id progLib = ((id(*)(id,SEL,id,id,id))objc_msgSend)( + [plCls alloc], + NSSelectorFromString(@"initWithImpl:container:configuration:"), + aotImpl, container, cfg); + if (!progLib) { + if (outErr) *outErr = [NSError errorWithDomain:@"MIL" code:4 + userInfo:@{NSLocalizedDescriptionKey: @"ProgramLibrary init failed"}]; + return nil; + } + + Class engCls = NSClassFromString(@"MLE5Engine"); + + // Find the correct init selector + static dispatch_once_t once; + static SEL engInitSel = NULL; + dispatch_once(&once, ^{ + unsigned int mc; + Method *ims = class_copyMethodList(engCls, &mc); + printf(" MLE5Engine init selectors:\n"); + for (unsigned int i = 0; i < mc; i++) { + const char *sel = sel_getName(method_getName(ims[i])); + if (strstr(sel, "init")) { + printf(" - %s [%s]\n", sel, method_getTypeEncoding(ims[i])); + if (strstr(sel, "ProgramLibrary") && strstr(sel, "modelDescription")) + engInitSel = method_getName(ims[i]); + } + } + free(ims); + }); + + if (!engInitSel) { + if (outErr) *outErr = [NSError errorWithDomain:@"MIL" code:5 + userInfo:@{NSLocalizedDescriptionKey: @"No MLE5Engine init selector found"}]; + return nil; + } + + printf(" Using init: %s\n", sel_getName(engInitSel)); + + // Count colons to determine argument count + const char *selName = sel_getName(engInitSel); + int argCount = 0; + for (const char *p = selName; *p; p++) if (*p == ':') argCount++; + + id engine = nil; + if (argCount == 7) { + // initWithProgramLibrary:modelDescription:configuration:functionName: + // classProbabilitiesFeatureName:optionalInputDefaultValues:compilerVersionInfo: + engine = ((id(*)(id,SEL,id,id,id,id,id,id,id))objc_msgSend)( + [engCls alloc], engInitSel, progLib, desc, cfg, + @"main", nil, nil, nil); + } else if (argCount == 5) { + engine = ((id(*)(id,SEL,id,id,id,id,id))objc_msgSend)( + [engCls alloc], engInitSel, 
progLib, desc, cfg, nil, label); + } else if (argCount == 6) { + engine = ((id(*)(id,SEL,id,id,id,id,id,id))objc_msgSend)( + [engCls alloc], engInitSel, progLib, desc, cfg, nil, nil, label); + } else { + printf(" Unexpected arg count %d for MLE5Engine init\n", argCount); + } + + if (!engine) { + if (outErr) *outErr = [NSError errorWithDomain:@"MIL" code:5 + userInfo:@{NSLocalizedDescriptionKey: @"Engine init failed"}]; + return nil; + } + + NSError *prepErr = nil; + BOOL prepOk = ((BOOL(*)(id,SEL,long long,NSError**))objc_msgSend)( + engine, NSSelectorFromString(@"prepareWithConcurrencyHint:error:"), + (long long)1, &prepErr); + if (!prepOk) { + printf(" [%s] Prepare failed: %s\n", [label UTF8String], + prepErr ? [[prepErr description] UTF8String] : "unknown"); + if (outErr) *outErr = prepErr; + return nil; + } + + return engine; +} + +static id runEngine(id engine, id features, + MLPredictionOptions *opts, NSError **outErr) { + return ((id(*)(id,SEL,id,id,NSError**))objc_msgSend)( + engine, NSSelectorFromString(@"predictionFromFeatures:options:error:"), + features, opts, outErr); +} + +#pragma mark - Numeric Helpers + +static float max_abs_diff(const float *a, const float *b, int n) { + float m = 0; + for (int i = 0; i < n; i++) { + float d = fabsf(a[i] - b[i]); + if (d > m) m = d; + } + return m; +} + +static float mean_abs(const float *a, int n) { + float s = 0; + for (int i = 0; i < n; i++) s += fabsf(a[i]); + return s / n; +} + +static void fill_random(float *buf, int n, float scale) { + for (int i = 0; i < n; i++) + buf[i] = ((float)arc4random() / (float)UINT32_MAX - 0.5f) * 2.0f * scale; +} + +static void print_first(const char *label, const float *buf, int total) { + int n = total < 8 ? total : 8; + printf(" %s: [", label); + for (int i = 0; i < n; i++) + printf("%s%.4f", i ? 
", " : "", buf[i]); + printf("]\n"); +} + +#pragma mark - CPU Reference Implementations + +static void cpu_sdpa(const float *Q, const float *K, const float *V, + float *out, int seqLen, int headDim) { + float scale = 1.0f / sqrtf((float)headDim); + float *scores = (float *)calloc(seqLen * seqLen, sizeof(float)); + + for (int i = 0; i < seqLen; i++) { + for (int j = 0; j < seqLen; j++) { + float dot = 0; + for (int d = 0; d < headDim; d++) + dot += Q[i * headDim + d] * K[j * headDim + d]; + scores[i * seqLen + j] = dot * scale; + } + } + for (int i = 0; i < seqLen; i++) { + float maxv = scores[i * seqLen]; + for (int j = 1; j < seqLen; j++) + if (scores[i * seqLen + j] > maxv) maxv = scores[i * seqLen + j]; + float sum = 0; + for (int j = 0; j < seqLen; j++) { + scores[i * seqLen + j] = expf(scores[i * seqLen + j] - maxv); + sum += scores[i * seqLen + j]; + } + for (int j = 0; j < seqLen; j++) + scores[i * seqLen + j] /= sum; + } + for (int i = 0; i < seqLen; i++) { + for (int d = 0; d < headDim; d++) { + float acc = 0; + for (int j = 0; j < seqLen; j++) + acc += scores[i * seqLen + j] * V[j * headDim + d]; + out[i * headDim + d] = acc; + } + } + free(scores); +} + +#pragma mark - Container Discovery + +static id findE5Container(MLModel *model, NSURL *compiledURL, MLModelConfiguration *cfg) { + // Try standard paths first + @try { + id eng = [model valueForKey:@"_internalEngine"]; + if ([NSStringFromClass([eng class]) containsString:@"MLE5"]) { + id pl = [eng valueForKey:@"programLibrary"]; + if (pl) { + id c = nil; + @try { c = [pl valueForKey:@"_container"]; } @catch(id e) { (void)e; } + if (!c) { + @try { + id impl = [pl valueForKey:@"_impl"]; + if (impl) c = [impl valueForKey:@"_container"]; + } @catch(id e) { (void)e; } + } + if (c) return c; + } + } + + // MLMultiFunctionProgramEngine path + if ([NSStringFromClass([eng class]) isEqualToString:@"MLMultiFunctionProgramEngine"]) { + NSDictionary *map = [eng valueForKey:@"_functionNameToEngineMap"]; + for (id key 
in map) { + id sub = map[key]; + if ([NSStringFromClass([sub class]) containsString:@"MLE5"]) { + id pl = [sub valueForKey:@"programLibrary"]; + if (pl) { + id c = nil; + @try { c = [pl valueForKey:@"_container"]; } @catch(id e) { (void)e; } + if (!c) { + @try { + id impl = [pl valueForKey:@"_impl"]; + if (impl) c = [impl valueForKey:@"_container"]; + } @catch(id e) { (void)e; } + } + if (c) return c; + } + } + } + } + } @catch(id e) { (void)e; } + + // Create MLProgramE5Container directly from compiled model + Class e5Cls = NSClassFromString(@"MLProgramE5Container"); + if (!e5Cls) return nil; + + // Find model.mil path inside the compiled model + NSString *compiledPath = [compiledURL path]; + NSString *milPath = [compiledPath stringByAppendingPathComponent:@"model.mil"]; + if (![[NSFileManager defaultManager] fileExistsAtPath:milPath]) { + printf(" No model.mil at %s\n", [milPath UTF8String]); + + // List contents + NSArray *contents = [[NSFileManager defaultManager] + contentsOfDirectoryAtPath:compiledPath error:nil]; + printf(" Compiled model contents: %s\n", [[contents description] UTF8String]); + } + + // Try to create E5 container with the model asset description from NN container + @try { + id eng = [model valueForKey:@"_internalEngine"]; + id nnContainer = [eng valueForKey:@"_container"]; + if (nnContainer) { + // Get model file path + NSString *modelFilePath = nil; + @try { modelFilePath = [nnContainer valueForKey:@"_modelFilePath"]; } + @catch(id e) { (void)e; } + + if (modelFilePath) { + printf(" Model file path: %s\n", [modelFilePath UTF8String]); + + // Try to create E5 container with this path + @try { + id c = ((id(*)(id,SEL,id,id))objc_msgSend)( + [e5Cls alloc], + NSSelectorFromString(@"initWithModelAssetPath:configuration:"), + modelFilePath, cfg); + if (c) return c; + } @catch(id e) { (void)e; } + } + + // Try initWithModelAssetDescription + @try { + id assetDesc = nil; + @try { assetDesc = [nnContainer valueForKey:@"_modelAssetDescription"]; } + 
@catch(id e) { (void)e; } + if (!assetDesc) { + @try { assetDesc = [nnContainer valueForKey:@"modelAssetDescription"]; } + @catch(id e) { (void)e; } + } + if (assetDesc) { + printf(" Asset description: %s\n", + [NSStringFromClass([assetDesc class]) UTF8String]); + id c = ((id(*)(id,SEL,id,id))objc_msgSend)( + [e5Cls alloc], + NSSelectorFromString(@"initWithModelAssetDescription:configuration:"), + assetDesc, cfg); + if (c) return c; + } + } @catch(id e) { (void)e; } + } + } @catch(id e) { (void)e; } + + // Dump E5Container init methods + unsigned int mc; + Method *ims = class_copyMethodList(e5Cls, &mc); + printf(" MLProgramE5Container init methods:\n"); + for (unsigned int i = 0; i < mc; i++) { + const char *sel = sel_getName(method_getName(ims[i])); + if (strstr(sel, "init")) + printf(" - %s\n", sel); + } + free(ims); + + return nil; +} + +#pragma mark - Main + +int main(int argc, const char *argv[]) { + (void)argc; (void)argv; + @autoreleasepool { + mach_timebase_info(&g_tb); + printf("================================================================\n"); + printf(" Custom MIL -> ANE: Experiments Y1, Y2, Y3, Z1\n"); + printf("================================================================\n\n"); + + dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/" + "AppleNeuralEngine", RTLD_NOW); + + NSString *pkgPath = @"/tmp/ane_sram_256ch_64sp.mlpackage"; + if (![[NSFileManager defaultManager] fileExistsAtPath:pkgPath]) { + printf("FATAL: %s not found. 
Run: python3 scripts/gen_mlpackages.py\n", + [pkgPath UTF8String]); + return 1; + } + + NSError *err = nil; + MLModelConfiguration *cfg = [[MLModelConfiguration alloc] init]; + cfg.computeUnits = MLComputeUnitsAll; + MLPredictionOptions *opts = [[MLPredictionOptions alloc] init]; + + NSURL *compiled = [MLModel compileModelAtURL: + [NSURL fileURLWithPath:pkgPath] error:&err]; + if (err) { printf("FATAL: compile: %s\n", [[err description] UTF8String]); return 1; } + + MLModel *refModel = [MLModel modelWithContentsOfURL:compiled + configuration:cfg error:&err]; + if (err) { printf("FATAL: load: %s\n", [[err description] UTF8String]); return 1; } + printf(" Ref model: %s\n", [NSStringFromClass([refModel class]) UTF8String]); + + MLModelDescription *refDesc = [refModel modelDescription]; + + // Find or create E5 container + id refContainer = findE5Container(refModel, compiled, cfg); + if (refContainer) { + printf(" Container: %s\n\n", [NSStringFromClass([refContainer class]) UTF8String]); + } else { + printf(" No E5 container found. 
Trying nil container...\n\n"); + } + + int ch = 256, sp = 64; + int nElems = ch * sp; + NSString *inName = [[[refDesc inputDescriptionsByName] allKeys] firstObject]; + NSString *outName = [[[refDesc outputDescriptionsByName] allKeys] firstObject]; + printf(" I/O: %s -> %s, shape [1,%d,1,%d]\n\n", [inName UTF8String], + [outName UTF8String], ch, sp); + + // ============================================================ + // Y1: Scaled Dot-Product Attention + // ============================================================ + printf("================================================================\n"); + printf(" Y1: scaled_dot_product_attention on ANE\n"); + printf("================================================================\n\n"); + + { + int seqLen = ch, headDim = sp; + + NSString *sdpaMIL = [NSString stringWithFormat: + @"program(1.3)\n" + "{\n" + " func main(tensor x) {\n" + " string c16 = const()[name = string(\"c16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = c16, x = x)[name = string(\"x16\")];\n" + " tensor sr = const()[name = string(\"sr\"), val = tensor([1, 1, %d, %d])];\n" + " tensor q = reshape(x = x16, shape = sr)[name = string(\"q\")];\n" + " tensor k = reshape(x = x16, shape = sr)[name = string(\"k\")];\n" + " tensor v = reshape(x = x16, shape = sr)[name = string(\"v\")];\n" + " tensor attn = scaled_dot_product_attention(query = q, key = k, value = v)[name = string(\"attn\")];\n" + " tensor or = const()[name = string(\"or\"), val = tensor([1, %d, 1, %d])];\n" + " tensor rs = reshape(x = attn, shape = or)[name = string(\"rs\")];\n" + " string c32 = const()[name = string(\"c32\"), val = string(\"fp32\")];\n" + " tensor cast_out = cast(dtype = c32, x = rs)[name = string(\"cast_out\")];\n" + " } -> (cast_out);\n" + "}\n", + ch, sp, ch, sp, + seqLen, headDim, seqLen, headDim, seqLen, headDim, seqLen, headDim, + seqLen, headDim, + ch, sp, ch, sp, + ch, sp]; + + printf(" Self-attention: B=1, nHeads=1, seqLen=%d, headDim=%d\n\n", seqLen, 
headDim); + + err = nil; + id engine = compileAndCreateEngine(sdpaMIL, @"y1_sdpa", refContainer, cfg, refDesc, &err); + + if (!engine) { + printf(" Y1 FAILED: %s\n\n", err ? [[err description] UTF8String] : "unknown"); + } else { + printf(" Y1: Engine created\n"); + MLMultiArray *inputArr = [[MLMultiArray alloc] + initWithShape:@[@1, @(ch), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + float *inPtr = (float *)[inputArr dataPointer]; + fill_random(inPtr, nElems, 0.5f); + + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inName: inputArr} error:nil]; + + NSError *runErr = nil; + uint64_t t0 = mach_absolute_time(); + id result = runEngine(engine, fp, opts, &runErr); + double ms = tb_ms(mach_absolute_time() - t0); + + if (runErr || !result) { + printf(" Y1 prediction FAILED: %s\n\n", + runErr ? [[runErr description] UTF8String] : "nil"); + } else { + MLMultiArray *outArr = [result featureValueForName:outName].multiArrayValue; + if (!outArr) { + printf(" Y1 output nil\n\n"); + } else { + float *outPtr = (float *)[outArr dataPointer]; + print_first("ANE out", outPtr, nElems); + printf(" Time: %.3f ms\n", ms); + + float *cpuOut = (float *)calloc(nElems, sizeof(float)); + cpu_sdpa(inPtr, inPtr, inPtr, cpuOut, seqLen, headDim); + print_first("CPU ref", cpuOut, nElems); + + float mad = max_abs_diff(outPtr, cpuOut, nElems); + printf(" Max diff: %.6f, Rel: %.2e\n", + mad, mad / (mean_abs(cpuOut, nElems) + 1e-10f)); + printf(" %s\n\n", mad < 0.02f ? "*** Y1 PASSED ***" : + (mad < 0.1f ? 
"Y1 WARNING" : "Y1 FAILED")); + + int N = 100; + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) runEngine(engine, fp, opts, nil); + printf(" Bench: %.4f ms/eval (%d iters)\n\n", + tb_ms(mach_absolute_time() - t0) / N, N); + free(cpuOut); + } + } + } + } + + // ============================================================ + // Y2: Linear with Embedded Weights + // ============================================================ + printf("================================================================\n"); + printf(" Y2: linear op with embedded weights on ANE\n"); + printf("================================================================\n\n"); + + { + int inDim = sp, outDim = sp; + + float *W = (float *)malloc(outDim * inDim * sizeof(float)); + float *B = (float *)malloc(outDim * sizeof(float)); + fill_random(W, outDim * inDim, 0.1f); + fill_random(B, outDim, 0.01f); + + NSMutableString *wLit = [NSMutableString stringWithString:@"["]; + for (int i = 0; i < outDim; i++) { + if (i > 0) [wLit appendString:@", "]; + [wLit appendString:@"["]; + for (int j = 0; j < inDim; j++) { + if (j > 0) [wLit appendString:@", "]; + [wLit appendFormat:@"%.8e", W[i * inDim + j]]; + } + [wLit appendString:@"]"]; + } + [wLit appendString:@"]"]; + + NSMutableString *bLit = [NSMutableString stringWithString:@"["]; + for (int j = 0; j < outDim; j++) { + if (j > 0) [bLit appendString:@", "]; + [bLit appendFormat:@"%.8e", B[j]]; + } + [bLit appendString:@"]"]; + + NSString *linearMIL = [NSString stringWithFormat: + @"program(1.3)\n" + "{\n" + " func main(tensor x) {\n" + " string c16 = const()[name = string(\"c16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = c16, x = x)[name = string(\"x16\")];\n" + " tensor rs = const()[name = string(\"rs\"), val = tensor([%d, %d])];\n" + " tensor flat = reshape(x = x16, shape = rs)[name = string(\"flat\")];\n" + " tensor Wc = const()[name = string(\"Wc\"), val = tensor(%@)];\n" + " tensor Bc = const()[name = string(\"Bc\"), val = 
tensor(%@)];\n" + " tensor lin = linear(x = flat, weight = Wc, bias = Bc)[name = string(\"lin\")];\n" + " tensor rs2 = const()[name = string(\"rs2\"), val = tensor([1, %d, 1, %d])];\n" + " tensor rso = reshape(x = lin, shape = rs2)[name = string(\"rso\")];\n" + " string c32 = const()[name = string(\"c32\"), val = string(\"fp32\")];\n" + " tensor cast_out = cast(dtype = c32, x = rso)[name = string(\"cast_out\")];\n" + " } -> (cast_out);\n" + "}\n", + ch, sp, ch, sp, + ch, sp, ch, sp, + outDim, inDim, outDim, inDim, wLit, + outDim, outDim, bLit, + ch, outDim, + ch, sp, ch, sp, + ch, sp]; + + printf(" Config: [%d,%d] linear %d->%d with embedded W+b\n\n", ch, sp, inDim, outDim); + + err = nil; + id engine = compileAndCreateEngine(linearMIL, @"y2_linear", refContainer, cfg, refDesc, &err); + + if (!engine) { + printf(" Y2 FAILED: %s\n\n", err ? [[err description] UTF8String] : "unknown"); + } else { + printf(" Y2: Engine created\n"); + MLMultiArray *inputArr = [[MLMultiArray alloc] + initWithShape:@[@1, @(ch), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + float *inPtr = (float *)[inputArr dataPointer]; + fill_random(inPtr, nElems, 0.5f); + + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inName: inputArr} error:nil]; + + NSError *runErr = nil; + uint64_t t0 = mach_absolute_time(); + id result = runEngine(engine, fp, opts, &runErr); + double ms = tb_ms(mach_absolute_time() - t0); + + if (runErr || !result) { + printf(" Y2 prediction FAILED: %s\n\n", + runErr ? 
[[runErr description] UTF8String] : "nil"); + } else { + MLMultiArray *outArr = [result featureValueForName:outName].multiArrayValue; + if (outArr) { + float *outPtr = (float *)[outArr dataPointer]; + print_first("ANE out", outPtr, nElems); + printf(" Time: %.3f ms\n", ms); + + // CPU: x[ch,sp] @ W^T[sp,sp] + b[sp] + float *cpuOut = (float *)calloc(nElems, sizeof(float)); + for (int i = 0; i < ch; i++) { + for (int j = 0; j < outDim; j++) { + float acc = 0; + for (int k = 0; k < inDim; k++) + acc += inPtr[i * inDim + k] * W[j * inDim + k]; + cpuOut[i * outDim + j] = acc + B[j]; + } + } + print_first("CPU ref", cpuOut, nElems); + + float mad = max_abs_diff(outPtr, cpuOut, nElems); + printf(" Max diff: %.6f, Rel: %.2e\n", + mad, mad / (mean_abs(cpuOut, nElems) + 1e-10f)); + printf(" %s\n\n", mad < 0.05f ? "*** Y2 PASSED ***" : + (mad < 0.5f ? "Y2 WARNING" : "Y2 FAILED")); + + int N = 100; + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) runEngine(engine, fp, opts, nil); + printf(" Bench: %.4f ms/eval (%d iters)\n\n", + tb_ms(mach_absolute_time() - t0) / N, N); + free(cpuOut); + } + } + } + free(W); free(B); + } + + // ============================================================ + // Y3: Transformer Block (Attention + FFN) + // ============================================================ + printf("================================================================\n"); + printf(" Y3: Transformer Block (LN + SDPA + Residual + LN + FFN + Residual)\n"); + printf("================================================================\n\n"); + + { + int seqLen = ch, dim = sp, ffnDim = 128; + + float *w1 = (float *)malloc(ffnDim * dim * sizeof(float)); + float *b1 = (float *)malloc(ffnDim * sizeof(float)); + float *w2 = (float *)malloc(dim * ffnDim * sizeof(float)); + float *b2 = (float *)malloc(dim * sizeof(float)); + fill_random(w1, ffnDim * dim, 0.05f); + fill_random(b1, ffnDim, 0.01f); + fill_random(w2, dim * ffnDim, 0.05f); + fill_random(b2, dim, 0.01f); + + // Build 
weight string literals + NSMutableString *(^buildMat)(float*, int, int) = ^(float *m, int rows, int cols) { + NSMutableString *s = [NSMutableString stringWithString:@"["]; + for (int i = 0; i < rows; i++) { + if (i > 0) [s appendString:@", "]; + [s appendString:@"["]; + for (int j = 0; j < cols; j++) { + if (j > 0) [s appendString:@", "]; + [s appendFormat:@"%.8e", m[i * cols + j]]; + } + [s appendString:@"]"]; + } + [s appendString:@"]"]; + return s; + }; + + NSMutableString *(^buildVec)(float*, int) = ^(float *v, int n) { + NSMutableString *s = [NSMutableString stringWithString:@"["]; + for (int i = 0; i < n; i++) { + if (i > 0) [s appendString:@", "]; + [s appendFormat:@"%.8e", v[i]]; + } + [s appendString:@"]"]; + return s; + }; + + NSMutableString *(^buildOnes)(int) = ^(int n) { + NSMutableString *s = [NSMutableString stringWithString:@"["]; + for (int i = 0; i < n; i++) { + if (i > 0) [s appendString:@", "]; + [s appendString:@"1.0"]; + } + [s appendString:@"]"]; + return s; + }; + + NSMutableString *(^buildZeros)(int) = ^(int n) { + NSMutableString *s = [NSMutableString stringWithString:@"["]; + for (int i = 0; i < n; i++) { + if (i > 0) [s appendString:@", "]; + [s appendString:@"0.0"]; + } + [s appendString:@"]"]; + return s; + }; + + NSString *tfMIL = [NSString stringWithFormat: + @"program(1.3)\n" + "{\n" + " func main(tensor x) {\n" + " string c16 = const()[name = string(\"c16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = c16, x = x)[name = string(\"x16\")];\n" + " tensor r2 = const()[name = string(\"r2\"), val = tensor([%d, %d])];\n" + " tensor flat = reshape(x = x16, shape = r2)[name = string(\"flat\")];\n" + // LN1 + " tensor g1 = const()[name = string(\"g1\"), val = tensor(%@)];\n" + " tensor b1 = const()[name = string(\"b1\"), val = tensor(%@)];\n" + " tensor la = const()[name = string(\"la\"), val = tensor([-1])];\n" + " fp16 eps = const()[name = string(\"eps\"), val = fp16(1e-5)];\n" + " tensor ln1 = layer_norm(x = flat, axes = 
la, gamma = g1, beta = b1, epsilon = eps)[name = string(\"ln1\")];\n" + // SDPA + " tensor sr = const()[name = string(\"sr\"), val = tensor([1, 1, %d, %d])];\n" + " tensor q = reshape(x = ln1, shape = sr)[name = string(\"q\")];\n" + " tensor k = reshape(x = ln1, shape = sr)[name = string(\"k\")];\n" + " tensor v = reshape(x = ln1, shape = sr)[name = string(\"v\")];\n" + " tensor at = scaled_dot_product_attention(query = q, key = k, value = v)[name = string(\"at\")];\n" + " tensor af = reshape(x = at, shape = r2)[name = string(\"af\")];\n" + // Residual 1 + " tensor r1 = add(x = flat, y = af)[name = string(\"r1\")];\n" + // LN2 + " tensor g2 = const()[name = string(\"g2\"), val = tensor(%@)];\n" + " tensor b2 = const()[name = string(\"b2\"), val = tensor(%@)];\n" + " tensor ln2 = layer_norm(x = r1, axes = la, gamma = g2, beta = b2, epsilon = eps)[name = string(\"ln2\")];\n" + // FFN + " tensor W1 = const()[name = string(\"W1\"), val = tensor(%@)];\n" + " tensor B1 = const()[name = string(\"B1\"), val = tensor(%@)];\n" + " tensor f1 = linear(x = ln2, weight = W1, bias = B1)[name = string(\"f1\")];\n" + " tensor ga = gelu(x = f1, mode = string(\"TANH_APPROXIMATION\"))[name = string(\"ga\")];\n" + " tensor W2 = const()[name = string(\"W2\"), val = tensor(%@)];\n" + " tensor B2 = const()[name = string(\"B2\"), val = tensor(%@)];\n" + " tensor f2 = linear(x = ga, weight = W2, bias = B2)[name = string(\"f2\")];\n" + // Residual 2 + " tensor r2o = add(x = r1, y = f2)[name = string(\"r2o\")];\n" + // Output + " tensor r4 = const()[name = string(\"r4\"), val = tensor([1, %d, 1, %d])];\n" + " tensor o16 = reshape(x = r2o, shape = r4)[name = string(\"o16\")];\n" + " string c32 = const()[name = string(\"c32\"), val = string(\"fp32\")];\n" + " tensor cast_out = cast(dtype = c32, x = o16)[name = string(\"cast_out\")];\n" + " } -> (cast_out);\n" + "}\n", + ch, sp, ch, sp, + seqLen, dim, seqLen, dim, + dim, dim, buildOnes(dim), + dim, dim, buildZeros(dim), + seqLen, dim, + seqLen, 
dim, seqLen, dim, seqLen, dim, seqLen, dim, + seqLen, dim, + seqLen, dim, + seqLen, dim, + dim, dim, buildOnes(dim), + dim, dim, buildZeros(dim), + seqLen, dim, + ffnDim, dim, ffnDim, dim, buildMat(w1, ffnDim, dim), + ffnDim, ffnDim, buildVec(b1, ffnDim), + seqLen, ffnDim, + seqLen, ffnDim, + dim, ffnDim, dim, ffnDim, buildMat(w2, dim, ffnDim), + dim, dim, buildVec(b2, dim), + seqLen, dim, + seqLen, dim, + ch, sp, ch, sp, + ch, sp]; + + printf(" Pipeline: LN->SDPA->Res->LN->FFN(%d->%d->%d)->Res\n\n", dim, ffnDim, dim); + + err = nil; + id engine = compileAndCreateEngine(tfMIL, @"y3_transformer", + refContainer, cfg, refDesc, &err); + + if (!engine) { + printf(" Y3 FAILED: %s\n\n", err ? [[err description] UTF8String] : "unknown"); + } else { + printf(" Y3: Engine created!\n"); + MLMultiArray *inputArr = [[MLMultiArray alloc] + initWithShape:@[@1, @(ch), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + float *inPtr = (float *)[inputArr dataPointer]; + fill_random(inPtr, nElems, 0.5f); + + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inName: inputArr} error:nil]; + + NSError *runErr = nil; + uint64_t t0 = mach_absolute_time(); + id result = runEngine(engine, fp, opts, &runErr); + double ms = tb_ms(mach_absolute_time() - t0); + + if (runErr || !result) { + printf(" Y3 prediction FAILED: %s\n\n", + runErr ? [[runErr description] UTF8String] : "nil"); + } else { + MLMultiArray *outArr = [result featureValueForName:outName].multiArrayValue; + if (outArr) { + float *outPtr = (float *)[outArr dataPointer]; + print_first("ANE out", outPtr, nElems); + printf(" Time: %.3f ms\n", ms); + float m = mean_abs(outPtr, nElems); + printf(" Non-zero: %s (mean_abs=%.6f)\n", m > 1e-6f ? "YES" : "NO", m); + printf(" %s\n\n", m > 1e-6f ? 
"*** Y3 PASSED ***" : "Y3 FAILED"); + + int N = 100; + t0 = mach_absolute_time(); + for (int i = 0; i < N; i++) runEngine(engine, fp, opts, nil); + printf(" Bench: %.4f ms/eval (%d iters)\n\n", + tb_ms(mach_absolute_time() - t0) / N, N); + } + } + } + free(w1); free(b1); free(w2); free(b2); + } + + // ============================================================ + // Z1: Linear Backward Pass (Gradient Computation) + // ============================================================ + printf("================================================================\n"); + printf(" Z1: Backward Pass (matmul with runtime tensors) on ANE\n"); + printf("================================================================\n\n"); + + { + int M = 128, K = 64, N = 64; + + NSString *bwdMIL = [NSString stringWithFormat: + @"program(1.3)\n" + "{\n" + " func main(tensor x) {\n" + " string c16 = const()[name = string(\"c16\"), val = string(\"fp16\")];\n" + " tensor x16 = cast(dtype = c16, x = x)[name = string(\"x16\")];\n" + " tensor r2 = const()[name = string(\"r2\"), val = tensor([%d, %d])];\n" + " tensor flat = reshape(x = x16, shape = r2)[name = string(\"flat\")];\n" + // Slice dY [0:128, :] + " tensor db = const()[name = string(\"db\"), val = tensor([0, 0])];\n" + " tensor de = const()[name = string(\"de\"), val = tensor([%d, %d])];\n" + " tensor dY = slice_by_index(x = flat, begin = db, end = de)[name = string(\"dY\")];\n" + // Slice W [128:192, :] + " tensor wb = const()[name = string(\"wb\"), val = tensor([%d, 0])];\n" + " tensor we = const()[name = string(\"we\"), val = tensor([%d, %d])];\n" + " tensor W = slice_by_index(x = flat, begin = wb, end = we)[name = string(\"W\")];\n" + // Slice pad [192:256, :] + " tensor pb = const()[name = string(\"pb\"), val = tensor([%d, 0])];\n" + " tensor pe = const()[name = string(\"pe\"), val = tensor([%d, %d])];\n" + " tensor pad = slice_by_index(x = flat, begin = pb, end = pe)[name = string(\"pad\")];\n" + // dX = dY @ W + " bool txf = const()[name 
= string(\"txf\"), val = bool(false)];\n" + " bool tyf = const()[name = string(\"tyf\"), val = bool(false)];\n" + " bool txt = const()[name = string(\"txt\"), val = bool(true)];\n" + " tensor dX = matmul(x = dY, y = W, transpose_x = txf, transpose_y = tyf)[name = string(\"dX\")];\n" + // dW = dY^T @ dY + " tensor dW = matmul(x = dY, y = dY, transpose_x = txt, transpose_y = tyf)[name = string(\"dW\")];\n" + // Concat [dX, dW, pad] + " int32 ax = const()[name = string(\"ax\"), val = int32(0)];\n" + " bool il = const()[name = string(\"il\"), val = bool(false)];\n" + " tensor pk = concat(values = (dX, dW, pad), axis = ax, interleave = il)[name = string(\"pk\")];\n" + " tensor r4 = const()[name = string(\"r4\"), val = tensor([1, %d, 1, %d])];\n" + " tensor o16 = reshape(x = pk, shape = r4)[name = string(\"o16\")];\n" + " string c32 = const()[name = string(\"c32\"), val = string(\"fp32\")];\n" + " tensor cast_out = cast(dtype = c32, x = o16)[name = string(\"cast_out\")];\n" + " } -> (cast_out);\n" + "}\n", + ch, sp, ch, sp, + ch, sp, ch, sp, + M, K, M, K, + M, M + K, K, K, K, + M + K, ch, sp, ch - M - K, sp, + M, N, + K, K, + ch, sp, + ch, sp, ch, sp, + ch, sp]; + + printf(" dX = dY[%d,%d] @ W[%d,%d] -> [%d,%d]\n", M, K, K, N, M, N); + printf(" dW = dY^T @ dY -> [%d,%d]\n\n", K, K); + + err = nil; + id engine = compileAndCreateEngine(bwdMIL, @"z1_backward", + refContainer, cfg, refDesc, &err); + + if (!engine) { + printf(" Z1 FAILED: %s\n\n", err ? 
[[err description] UTF8String] : "unknown"); + } else { + printf(" Z1: Engine created\n"); + MLMultiArray *inputArr = [[MLMultiArray alloc] + initWithShape:@[@1, @(ch), @1, @(sp)] + dataType:MLMultiArrayDataTypeFloat32 error:nil]; + float *inPtr = (float *)[inputArr dataPointer]; + fill_random(inPtr, nElems, 0.3f); + + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] + initWithDictionary:@{inName: inputArr} error:nil]; + + NSError *runErr = nil; + uint64_t t0 = mach_absolute_time(); + id result = runEngine(engine, fp, opts, &runErr); + double ms = tb_ms(mach_absolute_time() - t0); + + if (runErr || !result) { + printf(" Z1 prediction FAILED: %s\n\n", + runErr ? [[runErr description] UTF8String] : "nil"); + } else { + MLMultiArray *outArr = [result featureValueForName:outName].multiArrayValue; + if (outArr) { + float *outPtr = (float *)[outArr dataPointer]; + + // CPU: dX = dY @ W + float *dY_cpu = inPtr; + float *W_cpu = inPtr + M * K; + float *dX_cpu = (float *)calloc(M * N, sizeof(float)); + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) { + float a = 0; + for (int k = 0; k < K; k++) + a += dY_cpu[i*K+k] * W_cpu[k*N+j]; + dX_cpu[i*N+j] = a; + } + + // CPU: dW = dY^T @ dY + float *dW_cpu = (float *)calloc(K * K, sizeof(float)); + for (int i = 0; i < K; i++) + for (int j = 0; j < K; j++) { + float a = 0; + for (int m = 0; m < M; m++) + a += dY_cpu[m*K+i] * dY_cpu[m*K+j]; + dW_cpu[i*K+j] = a; + } + + print_first("ANE dX", outPtr, M * N); + print_first("CPU dX", dX_cpu, M * N); + float mad_dx = max_abs_diff(outPtr, dX_cpu, M * N); + printf(" dX diff: %.6f, Rel: %.2e\n", + mad_dx, mad_dx / (mean_abs(dX_cpu, M*N) + 1e-10f)); + + print_first("ANE dW", outPtr + M*N, K*K); + print_first("CPU dW", dW_cpu, K*K); + float mad_dw = max_abs_diff(outPtr + M*N, dW_cpu, K * K); + printf(" dW diff: %.6f, Rel: %.2e\n", + mad_dw, mad_dw / (mean_abs(dW_cpu, K*K) + 1e-10f)); + printf(" Time: %.3f ms\n", ms); + printf(" %s\n\n", + (mad_dx < 0.5f && 
mad_dw < 1.0f) + ? "*** Z1 PASSED ***" : "Z1: differences (fp16 precision)"); + + int NN = 100; + t0 = mach_absolute_time(); + for (int i = 0; i < NN; i++) runEngine(engine, fp, opts, nil); + printf(" Bench: %.4f ms/eval (%d iters)\n\n", + tb_ms(mach_absolute_time() - t0) / NN, NN); + + free(dX_cpu); free(dW_cpu); + } + } + } + } + + printf("================================================================\n"); + printf(" DONE\n"); + printf("================================================================\n"); + } + return 0; +} diff --git a/training/test_throughput_ceiling.m b/training/test_throughput_ceiling.m new file mode 100644 index 0000000..1401041 --- /dev/null +++ b/training/test_throughput_ceiling.m @@ -0,0 +1,238 @@ +// test_throughput_ceiling.m — Experiment I: Multi-kernel throughput ceiling +// Measures CPU round-trip overhead for sequential ANE kernel execution +// Build: make test_throughput_ceiling && ./test_throughput_ceiling +#import +#import +#include +#include "ane_runtime.h" + +static int g_fp16_io = 1; + +static NSString *gen_conv_mil_fp16(int ch, int sp) { + return [NSString stringWithFormat: + @"program(1.0)\n[buildInfo = dict, tensor>" + "({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n" + " func main(tensor x) {\n" + " tensor pt = const()[name=tensor(\"pt\")," + " val=tensor(\"valid\")];\n" + " tensor st = const()[name=tensor(\"st\")," + " val=tensor([1,1])];\n" + " tensor pd = const()[name=tensor(\"pd\")," + " val=tensor([0,0,0,0])];\n" + " tensor dl = const()[name=tensor(\"dl\")," + " val=tensor([1,1])];\n" + " tensor gr = const()[name=tensor(\"gr\")," + " val=tensor(1)];\n" + " tensor W = const()[name=tensor(\"W\"), " + "val=tensor(BLOBFILE(path=tensor" + "(\"@model_path/weights/weight.bin\"), offset=tensor(64)))];\n" + " tensor y = conv(dilations=dl,groups=gr," + "pad=pd,pad_type=pt,strides=st,weight=W,x=x)" + "[name=tensor(\"conv\")];\n" + " } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp]; +} + +static ANEKernel *compile_fp16_kernel(int 
ch, int sp) {
    // Body of compile_fp16_kernel(int ch, int sp); the signature opens on the
    // preceding line. Builds an in-memory weight blob holding a ch x ch fp16
    // identity matrix and compiles the fp16 conv MIL program against it.
    // Returns NULL if the blob cannot be allocated; otherwise returns whatever
    // ane_compile() returns.
    int ws = ch * ch * 2;           // weight payload: ch*ch fp16 values, 2 bytes each
    int tot = 128 + ws;             // 128 header bytes precede the payload
    uint8_t *blob = (uint8_t *)calloc((size_t)tot, 1);
    if (!blob) return NULL;         // callers already treat NULL as "compile failed"

    // Header fields. NOTE(review): the meaning of the individual fields is not
    // visible here; the values are reproduced exactly as the blob consumer
    // expects them.
    blob[0] = 1; blob[4] = 2;
    blob[64] = 0xEF; blob[65] = 0xBE; blob[66] = 0xAD; blob[67] = 0xDE;
    blob[68] = 1;
    *(uint32_t *)(blob + 72) = (uint32_t)ws;   // payload size in bytes
    *(uint32_t *)(blob + 80) = 128;            // presumably the payload offset

    // calloc zero-filled the payload; setting the diagonal to 1.0 yields identity.
    _Float16 *wp = (_Float16 *)(blob + 128);
    for (int i = 0; i < ch; i++) wp[i * ch + i] = (_Float16)1.0f;

    // NSData takes ownership of blob and frees it when released.
    NSData *wdata = [NSData dataWithBytesNoCopy:blob length:(NSUInteger)tot
                                   freeWhenDone:YES];

    NSString *mil = gen_conv_mil_fp16(ch, sp);
    NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
    size_t ioBytes = (size_t)ch * sp * 2;      // one fp16 tensor of ch*sp elements
    return ane_compile(md, wdata, 1, &ioBytes, 1, &ioBytes);
}

/// Converts a mach_absolute_time() tick delta to milliseconds using timebase `tb`.
static double ceiling_ticks_to_ms(uint64_t ticks, mach_timebase_info_data_t tb) {
    return (double)ticks * tb.numer / tb.denom / 1e6;
}

/// Copies `nbytes` from the first output surface of `src` into the first input
/// surface of `dst`, locking both surfaces for the duration of the copy.
static void ceiling_copy_output_to_input(ANEKernel *src, ANEKernel *dst, size_t nbytes) {
    IOSurfaceLock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
    IOSurfaceLock(dst->ioInputs[0], 0, NULL);
    memcpy(IOSurfaceGetBaseAddress(dst->ioInputs[0]),
           IOSurfaceGetBaseAddress(src->ioOutputs[0]),
           nbytes);
    IOSurfaceUnlock(dst->ioInputs[0], 0, NULL);
    IOSurfaceUnlock(src->ioOutputs[0], kIOSurfaceLockReadOnly, NULL);
}

/// Experiment I entry point.
///
/// For each tensor configuration, compiles up to 12 identical conv kernels and
/// times four scenarios: (1) eval + memcpy hand-off between consecutive
/// kernels, (2) eval only, (3) memcpy hand-off only, (4) eval dispatched through
/// a GCD serial queue. It then reports the per-hand-off CPU overhead derived
/// from (1) and (2).
///
/// Returns 1 if the ANE runtime is unavailable, 0 otherwise.
int main(int argc, const char *argv[]) {
    (void)argc; (void)argv;
    @autoreleasepool {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        printf("============================================================\n");
        printf(" Experiment I: Multi-Kernel Throughput Ceiling\n");
        printf(" Measuring CPU round-trip overhead for sequential ANE ops\n");
        printf("============================================================\n\n");

        ane_init();
        if (!g_ane_ok) { printf("ANE not available\n"); return 1; }

        typedef struct { int ch; int sp; const char *name; } Config;
        Config configs[] = {
            {64, 32, "64x32 (test)"},
            {256, 64, "256x64 (small)"},
            {768, 256, "768x256 (prod)"},
        };
        int nconfigs = (int)(sizeof(configs) / sizeof(configs[0]));

        enum { kNumLayers = 12 };   // single source of truth for the layer count
        const int warmup = 5;
        const int iters = 50;

        for (int ci = 0; ci < nconfigs; ci++) {
            Config cfg = configs[ci];
            printf("=== Config: %s ===\n", cfg.name);

            // Compile until the first failure; only the first `compiled`
            // entries of kernels[] are valid afterwards.
            ANEKernel *kernels[kNumLayers] = {0};
            int compiled = 0;
            for (int i = 0; i < kNumLayers; i++) {
                @try {
                    kernels[i] = compile_fp16_kernel(cfg.ch, cfg.sp);
                    if (!kernels[i]) {
                        printf(" Kernel %d compile failed\n", i);
                        break;
                    }
                    compiled++;
                } @catch (NSException *ex) {
                    printf(" Kernel %d exception: %s\n", i,
                           [[ex reason] UTF8String]);
                    break;
                }
            }
            printf(" Compiled %d/%d kernels\n", compiled, kNumLayers);
            if (compiled < 2) {
                // At least one hand-off (two kernels) is needed for the copy tests.
                printf(" Need at least 2 kernels, skipping\n\n");
                for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
                continue;
            }

            size_t ioBytes = (size_t)cfg.ch * cfg.sp * 2;

            // Exceptions are still tolerated so one bad eval does not abort the
            // run, but they are counted: a timing taken while evals were
            // throwing does not measure real ANE work.
            __block int evalExceptions = 0;

            // Per-pass results of Test 1 and Test 2, kept for the final
            // overhead computation.
            double seqPerIter = 0.0;
            double runPerIter = 0.0;

            // --- Test 1: Sequential (run + memcpy chain) ---
            printf("\n --- Test 1: Sequential (run + memcpy) ---\n");
            {
                for (int w = 0; w < warmup; w++) {
                    @try {
                        for (int i = 0; i < compiled; i++)
                            ane_eval(kernels[i]);
                    } @catch (NSException *ex) { (void)ex; evalExceptions++; }
                }

                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled - 1; i++) {
                        @try {
                            ane_eval(kernels[i]);
                            ceiling_copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                        } @catch (NSException *ex) { (void)ex; evalExceptions++; }
                    }
                    // The last kernel has no successor to feed.
                    @try {
                        ane_eval(kernels[compiled - 1]);
                    } @catch (NSException *ex) { (void)ex; evalExceptions++; }
                }
                double totalMs = ceiling_ticks_to_ms(mach_absolute_time() - t0, tb);
                seqPerIter = totalMs / iters;
                double perKernel = seqPerIter / compiled;
                printf(" Total: %.2f ms/pass (%d kernels)\n", seqPerIter, compiled);
                printf(" Per kernel: %.3f ms\n", perKernel);
                printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / seqPerIter);
            }

            // --- Test 2: Run-only (no memcpy, pure ANE overhead) ---
            printf("\n --- Test 2: Run-only (no memcpy between) ---\n");
            {
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    for (int i = 0; i < compiled; i++) {
                        @try {
                            ane_eval(kernels[i]);
                        } @catch (NSException *ex) { (void)ex; evalExceptions++; }
                    }
                }
                double totalMs = ceiling_ticks_to_ms(mach_absolute_time() - t0, tb);
                runPerIter = totalMs / iters;
                double perKernel = runPerIter / compiled;
                printf(" Total: %.2f ms/pass (%d kernels)\n", runPerIter, compiled);
                printf(" Per kernel: %.3f ms\n", perKernel);
                printf(" Throughput: %.0f kernels/s\n", compiled * 1000.0 / runPerIter);
            }

            // --- Test 3: Memcpy-only overhead ---
            printf("\n --- Test 3: Memcpy-only overhead ---\n");
            {
                // 10x more passes than the eval tests.
                const int copyIters = iters * 10;
                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < copyIters; it++) {
                    for (int i = 0; i < compiled - 1; i++)
                        ceiling_copy_output_to_input(kernels[i], kernels[i + 1], ioBytes);
                }
                double totalMs = ceiling_ticks_to_ms(mach_absolute_time() - t0, tb);
                double perIter = totalMs / copyIters;
                double perCopy = perIter / (compiled - 1);
                printf(" Total: %.3f ms/pass (%d copies)\n", perIter, compiled - 1);
                printf(" Per memcpy: %.4f ms (%lu bytes)\n", perCopy, (unsigned long)ioBytes);
            }

            // --- Test 4: GCD serial queue ---
            printf("\n --- Test 4: GCD serial queue ---\n");
            {
                dispatch_queue_t q = dispatch_queue_create(
                    "ane.throughput", DISPATCH_QUEUE_SERIAL);
                dispatch_semaphore_t sem = dispatch_semaphore_create(0);
                const int ncomp = compiled;

                uint64_t t0 = mach_absolute_time();
                for (int it = 0; it < iters; it++) {
                    // `done` is only touched by blocks on the serial queue, which
                    // run one at a time, so no extra synchronisation is needed.
                    __block int done = 0;
                    for (int i = 0; i < ncomp; i++) {
                        // Read the pointer outside the block so the block
                        // captures a scalar rather than the C array.
                        ANEKernel *kp = kernels[i];
                        dispatch_async(q, ^{
                            @try {
                                ane_eval(kp);
                            } @catch (NSException *ex) { (void)ex; evalExceptions++; }
                            done++;
                            if (done == ncomp)
                                dispatch_semaphore_signal(sem);
                        });
                    }
                    // Wait for the whole pass before enqueueing the next one.
                    dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
                }
                double totalMs = ceiling_ticks_to_ms(mach_absolute_time() - t0, tb);
                double perIter = totalMs / iters;
                printf(" Total: %.2f ms/pass (%d kernels, serial queue)\n",
                       perIter, ncomp);
                printf(" Per kernel: %.3f ms\n", perIter / ncomp);
            }

            printf("\n --- CPU Round-trip Overhead ---\n");
            printf(" Overhead = (Sequential - RunOnly) / %d copies\n", compiled - 1);
            // Report the measured value, not just the formula. It can come out
            // slightly negative when the two runs differ by less than timer noise.
            printf(" Overhead: %.4f ms per hand-off (%.2f ms sequential, %.2f ms run-only)\n",
                   (seqPerIter - runPerIter) / (compiled - 1), seqPerIter, runPerIter);
            printf(" This is what chaining would eliminate per layer.\n");
            if (evalExceptions > 0) {
                printf(" WARNING: %d exception(s) were caught during evaluation;"
                       " timings for this config are unreliable\n", evalExceptions);
            }

            for (int i = 0; i < compiled; i++) ane_free(kernels[i]);
            printf("\n");
        }

        printf("Done.\n");
    }
    return 0;
}