From a9da4f10911aea1d1bc703db62047707a0d1a1d5 Mon Sep 17 00:00:00 2001 From: qukelin Date: Fri, 13 Mar 2026 20:50:17 +0800 Subject: [PATCH 1/4] feat(pto): add explicit tfree pipe interface --- .../2026-03-13-tpop-pipe-interface-spec-zh.md | 640 ++++++++++++++++++ include/PTO/IR/PTOOps.td | 218 ++++++ include/PTO/IR/PTOTypeDefs.td | 12 + include/PTO/Transforms/Passes.h | 4 +- include/PTO/Transforms/Passes.td | 34 + lib/PTO/IR/PTO.cpp | 219 ++++++ lib/PTO/Transforms/CMakeLists.txt | 2 + lib/PTO/Transforms/PTOLowerTPopPass.cpp | 170 +++++ lib/PTO/Transforms/PTOPlanMemory.cpp | 5 + lib/PTO/Transforms/PTOToEmitC.cpp | 469 ++++++++++++- lib/PTO/Transforms/PTOVerifyTFreePass.cpp | 145 ++++ test/basic/get_fifo_tile_verify.mlir | 25 + .../initialize_pipe_verify_gm_missing.mlir | 11 + .../initialize_pipe_verify_gm_wrongtype.mlir | 12 + ...nitialize_pipe_verify_local_wrongtype.mlir | 13 + test/basic/tfree_after_if.mlir | 49 ++ test/basic/tfree_emitc_a2a3.mlir | 34 + test/basic/tfree_emitc_a5.mlir | 67 ++ test/basic/tfree_missing.mlir | 39 ++ test/basic/tfree_multiple_outstanding.mlir | 47 ++ test/basic/tfree_ops.mlir | 33 + test/basic/tfree_use_after_free.mlir | 34 + test/basic/tfree_verify.mlir | 17 + test/basic/tpush_tpop_fifo_emitc.mlir | 48 ++ .../tpush_tpop_fifo_emitc_a5_dirmask.mlir | 61 ++ test/basic/tpush_tpop_fifo_ops.mlir | 46 ++ test/basic/tpush_tpop_fifo_verify.mlir | 18 + test/tmp_tpush_tpop_matmul_pipe_a5/README.md | 47 ++ .../tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir | 64 ++ .../kernel_a5_manual.cpp | 78 +++ test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp | 70 ++ tools/ptoas/ptoas.cpp | 10 +- 32 files changed, 2720 insertions(+), 21 deletions(-) create mode 100644 docs/2026-03-13-tpop-pipe-interface-spec-zh.md create mode 100644 lib/PTO/Transforms/PTOLowerTPopPass.cpp create mode 100644 lib/PTO/Transforms/PTOVerifyTFreePass.cpp create mode 100644 test/basic/get_fifo_tile_verify.mlir create mode 100644 test/basic/initialize_pipe_verify_gm_missing.mlir create mode 100644 test/basic/initialize_pipe_verify_gm_wrongtype.mlir create mode 100644 test/basic/initialize_pipe_verify_local_wrongtype.mlir create mode 100644 test/basic/tfree_after_if.mlir create mode 100644 test/basic/tfree_emitc_a2a3.mlir create mode 100644 test/basic/tfree_emitc_a5.mlir create mode 100644 test/basic/tfree_missing.mlir create mode 100644 test/basic/tfree_multiple_outstanding.mlir create mode 100644 test/basic/tfree_ops.mlir create mode 100644 test/basic/tfree_use_after_free.mlir create mode 100644 test/basic/tfree_verify.mlir create mode 100644 test/basic/tpush_tpop_fifo_emitc.mlir create mode 100644 test/basic/tpush_tpop_fifo_emitc_a5_dirmask.mlir create mode 100644 test/basic/tpush_tpop_fifo_ops.mlir create mode 100644 test/basic/tpush_tpop_fifo_verify.mlir create mode 100644 test/tmp_tpush_tpop_matmul_pipe_a5/README.md create mode 100644 test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir create mode 100644 test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp create mode 100644 test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp diff --git a/docs/2026-03-13-tpop-pipe-interface-spec-zh.md b/docs/2026-03-13-tpop-pipe-interface-spec-zh.md new file mode 100644 index 00000000..ceecd574 --- /dev/null +++ b/docs/2026-03-13-tpop-pipe-interface-spec-zh.md @@ -0,0 +1,640 @@ +# PTOAS Pipe & TPUSH/TPOP 接口定义 + +--- + +## 1. 概述 + +PTOAS 提供跨 Cube/Vector 单元的 pipe 通信机制,用于 Cube 和 Vector 之间的 tile 数据传递。根据数据流转路径,定义两种 pipe 初始化指令: + +| 指令 | 数据路径 | 含义 | +|---|---|---| +| `pto.initialize_l2g2l_pipe` | local → GM → local | 数据经 GM 中转,适用于 A3 和 A5 | +| `pto.initialize_l2l_pipe` | local → local | 数据在 local buffer 内直传,仅 A5 | + +两种 pipe 共享统一的操作接口——生产者:`pto.tpush`;消费者:`pto.tpop` → `pto.get_fifo_tile` → `pto.tfree`。 + +其中,`pto.tfree(%pipe, %slot_id)` 是**必写**的资源释放操作: + +- `pto.tpop` 借出一个 slot +- `pto.get_fifo_tile` 暴露该 slot 对应的 tile 视图 +- 用户在 tile 使用结束后,必须显式写出对应的 `pto.tfree` +- 编译器会对这组接口的配对关系和使用顺序做 verify 检查 + +### 编译变换过程 + +``` +前端 IR Lowered IR EmitC +──────── ────────── ───── +pto.initialize_l2g2l_pipe(...) ─→ ─→ TPipe<..., GM_FIFO, ...>(gm, local) +pto.initialize_l2l_pipe(...) ─→ ─→ TPipe<..., VEC/MAT_FIFO, ...>(local) + +pto.tpush(%tile, %pipe) ─→ ─→ TPUSH(tile, pipe) + +pto.tpop(%pipe) → %slot_id ┐ + ├→ pto.tpop_internal(%tile, %pipe) → TPOP(tile, pipe) +pto.get_fifo_tile(%pipe, %sid) ┘ [DPS] + +pto.tfree(%pipe, %slot_id) ─→ pto.tfree_internal(%pipe) ─→ TFREE(pipe) +``` + +--- + +## 2. Pipe 初始化指令 + +### 2.1 `pto.initialize_l2g2l_pipe` + +创建经 GM 中转的 pipe。生产者将 tile 写入 GM,消费者 TPOP 时从 GM 搬运到 local FIFO slot,再暴露给用户。 + +**数据路径:** `local(producer) → GM FIFO → local FIFO(consumer)` + +**语法:** + +```mlir +%pipe = pto.initialize_l2g2l_pipe { + dir_mask = , + local_fifo_depth = // 可选,默认 2 +} + ( : memref<..., #pto.address_space> + [, : i32] ) + -> !pto.pipe +``` + +**参数:** + +| 参数 | 类型 | 说明 | +|---|---|---| +| `dir_mask` | `i8`(属性) | 方向:1 = C2V(Cube→Vector),2 = V2C(Vector→Cube) | +| `local_fifo_depth` | `i8`(属性,可选) | local FIFO 的 slot 深度,默认 2(double-buffering) | +| `gm_addr` | `memref<..., #pto.address_space>`(操作数,必须) | GM FIFO 基地址 | +| `local_addr` | `i32`(操作数,可选) | local FIFO 基地址。省略时由 plan memory 分配 | + +**`local_fifo_depth` 说明:** +- 指定 TPOP 从 GM 搬运数据到 local buffer 时,local FIFO 的 slot 深度 +- plan memory / analysis pass 根据此值计算 local buffer 分配量(= `local_fifo_depth × slot_size`) +- `slot_size = max(srcTile.size, dstTile.size)`(见第 9 节) +- 默认值 2,支持 double-buffering + +**结果:** `!pto.pipe` + +**校验规则:** +- `dir_mask` 必须为 1 或 2 +- `local_fifo_depth` 如指定须大于 0 +- `gm_addr` 必须带 GM address space +- `local_addr` 如存在须为 `i32` + +**可用架构:** A3、A5 + +**示例:** + +```mlir +// 指定 GM 地址和 local 地址 +%pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_addr : memref<64x128xf32, #pto.address_space>, %local_addr : i32) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> + +// 省略 local 地址,由 plan memory 分配 +%pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_addr : memref<64x128xf32, #pto.address_space>) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> + +// 指定 local_fifo_depth=4 +%pipe = pto.initialize_l2g2l_pipe {dir_mask = 1, local_fifo_depth = 4} + (%gm_addr : memref<64x128xf32, #pto.address_space>, %local_addr : i32) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> +``` + +--- + +### 2.2 `pto.initialize_l2l_pipe` + +创建 local 直传 pipe。生产者将 tile 写入 local FIFO slot,消费者 TPOP 时直接暴露该 slot 给用户。数据不经过 GM。 + +**数据路径:** `local(producer) → local FIFO(consumer)` + +**语法:** + +```mlir +%pipe = pto.initialize_l2l_pipe { + dir_mask = +} + ( [ : i32] ) + -> !pto.pipe +``` + +**参数:** + +| 参数 | 类型 | 说明 | +|---|---|---| +| `dir_mask` | `i8`(属性) | 方向:1 = C2V(Cube→Vector),2 = V2C(Vector→Cube) | +| `local_addr` | `i32`(操作数,可选) | local FIFO 基地址。省略时由 plan memory 分配 | + +**结果:** `!pto.pipe` + +**校验规则:** +- `dir_mask` 必须为 1 或 2 +- `local_addr` 如存在须为 `i32` + +**可用架构:** 仅 A5 + +**示例:** + +```mlir +// C2V,指定 local 地址 +%pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%local_addr : i32) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> + +// V2C,省略地址由 plan memory 分配 +%pipe = pto.initialize_l2l_pipe {dir_mask = 2} + -> !pto.pipe>, + memref<64x128xf32, #pto.address_space>> +``` + +--- + +### 2.3 两种 Pipe 的对比 + +| 特性 | `initialize_l2g2l_pipe` | `initialize_l2l_pipe` | +|---|---|---| +| 数据路径 | local → GM → local | local → local | +| FIFOType | `GM_FIFO` | `VEC_FIFO`(C2V)/ `MAT_FIFO`(V2C) | +| 可用架构 | A3 + A5 | 仅 A5 | +| GM 地址 | 必须 | 无 | +| local 地址 | 可选(plan memory 可分配) | 可选(plan memory 可分配) | +| `local_fifo_depth` | 可选,默认 2 | 不适用(深度 = FiFoDepth) | +| TPOP 行为 | wait → DMA(GM→local) → bind tile | wait → bind tile | +| TPOP 流水线 | `PIPE_MTE2`(涉及 DMA) | `PIPE_S`(仅地址赋值) | + +--- + +## 3. 生产者/消费者指令 + +### 3.1 `pto.tpop` + +等待 pipe 消费者 slot 就绪,返回 slot ID。 + +**语法:** + +```mlir +%slot_id = pto.tpop ( %pipe : !pto.pipe ) -> index +``` + +**参数:** + +| 参数 | 类型 | 说明 | +|---|---|---| +| `pipe_handle` | `!pto.pipe` | 待消费的 pipe | + +**结果:** `index` — 获取到的 ring buffer slot ID。 + +**Traits / Interfaces:** +- 不实现 `OpPipeInterface`(lowering 后由 `tpop_internal` 承载流水线信息) +- `MemoryEffectsOpInterface`:`pipe_handle` Read + Write + +**`%slot_id` 使用约束:** +- 只能被 `pto.get_fifo_tile` 和 `pto.tfree` 使用 +- 必须有且仅有一个 `pto.get_fifo_tile` 消费 +- 必须有且仅有一个 `pto.tfree` 消费 +- `pto.get_fifo_tile` 和 `pto.tfree` 必须使用与 `pto.tpop` 相同的 `pipe_handle` +- 当前实现要求对应的 `pto.get_fifo_tile` 和 `pto.tfree` 与该 `pto.tpop` 位于同一个 block 中 + +--- + +### 3.2 `pto.get_fifo_tile` + +将 slot ID 解析为指向对应 local FIFO entry 的 tile 视图。纯地址计算,不搬运数据。 + +**语法:** + +```mlir +%tile = pto.get_fifo_tile ( %pipe, %slot_id : !pto.pipe, index ) + -> DstTileType +``` + +**参数:** + +| 参数 | 类型 | 说明 | +|---|---|---| +| `pipe_handle` | `!pto.pipe` | 拥有 FIFO 的 pipe | +| `slot_id` | `index` | 由 `pto.tpop` 返回的 slot ID | + +**结果:** `DstTileType` — 从 `pipe.dstTileType` 推导。结果 tile 指向 FIFO slot 在 local buffer 中的存储。 + +**Traits / Interfaces:** +- `ViewLikeOpInterface` +- `MemoryEffectsOpInterface`:`pipe_handle` Read;结果无 Allocate + +**`%tile` 使用约束:** +- 只读:可作为下游指令的源操作数(`ins`),不可作为目标操作数(`outs`) +- 不可在对应的 `pto.tfree` / `pto.tfree_internal` 之后使用 + +--- + +### 3.3 `pto.tfree` + +释放 FIFO slot。必写;每个 `pto.tpop` 都必须显式对应一个 `pto.tfree`。 + +**语法:** + +```mlir +pto.tfree ( %pipe, %slot_id : !pto.pipe, index ) +``` + +**参数:** + +| 参数 | 类型 | 说明 | +|---|---|---| +| `pipe_handle` | `!pto.pipe` | 拥有该 slot 的 pipe | +| `slot_id` | `index` | 待释放的 slot ID | + +**Traits / Interfaces:** +- 不实现 `OpPipeInterface` +- `MemoryEffectsOpInterface`:`pipe_handle` Read + Write + +**使用约束:** +- 必须与对应 `pto.tpop` 的 `pipe_handle` 一致 +- 当前实现要求位于与对应 `pto.tpop` 相同的 block 中 +- 必须出现在对应 `pto.get_fifo_tile` 之后 +- 必须晚于该 borrowed tile 的所有使用 + +--- + +### 3.4 `pto.tpush` + +生产者侧推送。生产者写入 pipe 的后备存储(l2g2l_pipe 写 GM,l2l_pipe 写 local buffer)。 + +```mlir +pto.tpush(%src_tile, %pipe : SrcTileType, !pto.pipe) +``` + +--- + +## 4. 使用示例 + +### 4.1 L2L Pipe(A5 local 直传) + +```mlir +func.func @c2v_l2l_example(%local_addr: i32) { + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%local_addr : i32) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : ...) + } + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe<...>) -> index + %tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe<...>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%tile : ...) outs(%dst : ...) + pto.tfree(%pipe, %slot_id : !pto.pipe<...>, index) + } + return +} +``` + +### 4.2 L2G2L Pipe(经 GM 中转) + +```mlir +func.func @c2v_l2g2l_example(%gm_addr: memref<64x128xf32, #pto.address_space>, + %local_addr: i32) { + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_addr : memref<64x128xf32, #pto.address_space>, %local_addr : i32) + -> !pto.pipe>, + memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : ...) + } + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe<...>) -> index + %tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe<...>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%tile : ...) outs(%dst : ...) + pto.tfree(%pipe, %slot_id : !pto.pipe<...>, index) + } + return +} +``` + +--- + +## 5. 内部 Op(Lowered IR) + +以下 Op 仅在 `LowerTPop` pass 后出现,不属于前端 IR。 + +### 5.1 `pto.declare_tile` + +声明未绑定地址的 tile,地址在运行时由 `pto.tpop_internal` 赋值。 + +```mlir +%tile = pto.declare_tile -> TileBufType +``` + +### 5.2 `pto.tpop_internal` + +统一的 TPOP,DPS 格式。等待 slot 就绪,绑定 tile 地址到 FIFO slot。l2g2l pipe 还会执行 GM → local 搬运。 + +```mlir +pto.tpop_internal ( %tile, %pipe : TileBufType, !pto.pipe ) + { assigned_pipe = #pto.pipe } +``` + +| pipe 类型 | assigned_pipe | 原因 | +|---|---|---| +| l2g2l_pipe(GM_FIFO) | `PIPE_MTE2` | 涉及 GM → local DMA 搬运 | +| l2l_pipe(VEC/MAT_FIFO) | `PIPE_S` | 仅地址赋值 | + +**运行时语义:** + +``` +1. wait_until(pipe.consumer_slot_ready()) +2. [l2g2l_pipe] dma_copy(pipe.gm_slot_addr() → pipe.local_slot_addr()) +3. tile.rebind_addr(pipe.local_slot_addr()) +4. pipe.advance_consumer_cursor() +``` + +### 5.3 `pto.tfree_internal` + +释放 pipe slot。由显式 frontend `pto.tfree` lowering 生成。 + +```mlir +pto.tfree_internal ( %pipe : !pto.pipe ) +``` + +- `OpPipeInterface`:`PIPE_S`(scalar 操作) +- EmitC:`TFREE(pipe)`(A3 和 A5 均生成,因为都使用 local FIFO) + +--- + +## 6. 编译变换与校验 + +### 6.1 `LowerTPop` Pass + +将前端 `tpop` + `get_fifo_tile` 融合为 `tpop_internal`,并将显式 `tfree` 降级为 `tfree_internal`。 + +**融合 Pattern:** + +``` +输入: 输出: +%sid = pto.tpop(%pipe) %tile = pto.declare_tile -> DstTileType +%tile = pto.get_fifo_tile(%pipe, %sid) pto.tpop_internal(%tile, %pipe) + { assigned_pipe = ... } +``` + +`assigned_pipe` 根据 pipe 的定义 op 确定: +- `initialize_l2g2l_pipe` → `PIPE_MTE2` +- `initialize_l2l_pipe` → `PIPE_S` + +**降级 Pattern:** + +``` +输入: 输出: +pto.tfree(%pipe, %slot_id) pto.tfree_internal(%pipe) +``` + +在 lowering 之前,编译器会先验证: + +- 每个 `pto.tpop` 必须且只能有一个 `pto.get_fifo_tile` +- 每个 `pto.tpop` 必须且只能有一个显式 `pto.tfree` +- `pto.get_fifo_tile` / `pto.tfree` 必须与对应 `pto.tpop` 使用相同 `pipe_handle` +- 当前实现要求三者位于同一个 block 中,且 `pto.tfree` 出现在 `pto.get_fifo_tile` 之后 + +### 6.2 `VerifyTFree` Pass + +对 lowered IR 中的 `tpop_internal` / `tfree_internal` 配对关系做合法性检查。 + +``` +对每个 pto.tpop_internal: + 1. 要求后续同 block 中存在匹配的 tfree_internal + 2. 验证 borrowed tile 不会在 tfree_internal 之后继续使用 + 3. 验证同一 pipe 在匹配 free 之前不会再次 tpop_internal +``` + +也就是说,用户负责显式写出 `pto.tfree`,编译器负责检查释放是否存在、是否足够晚、以及是否满足当前的单 outstanding 约束。 + +### 6.3 Pass Pipeline + +``` +前端 IR + │ + ▼ +LowerTPop + │ - tpop + get_fifo_tile → declare_tile + tpop_internal + │ - tfree → tfree_internal + ▼ +VerifyTFree + │ - 验证显式 tfree_internal 的位置与 outstanding-pop 约束 + ▼ +LoweringSyncToPipe + │ - 高层 sync op → 低层 pipe sync op + ▼ +PTOInsertSync(可选) + │ - 自动同步插入 + ▼ +...(其他 transform)... + ▼ +PTOToEmitC + │ - initialize_l2g2l_pipe → TPipe<..., GM_FIFO, ...>(gm, local) + │ - initialize_l2l_pipe → TPipe<..., VEC/MAT_FIFO, ...>(local) + │ - declare_tile → Tile<...> varname; + │ - tpop_internal → TPOP(tile, pipe); + │ - tfree_internal → TFREE(pipe) + ▼ +C++ 输出 +``` + +--- + +## 7. EmitC Lowering + +### 7.1 Pipe 初始化 → TPipe + +**`initialize_l2l_pipe`(A5):** + +```mlir +%pipe = pto.initialize_l2l_pipe {dir_mask = 1} (%local_addr : i32) +``` + +```cpp +auto v28 = TPipe<0, FIFOType::VEC_FIFO, 8, 8, SrcTile, DstTile>(local_addr); +``` + +**`initialize_l2g2l_pipe`(A3 或 A5):** + +```mlir +%pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_addr : memref<...>, %local_addr : i32) +``` + +```cpp +auto v28 = TPipe<0, FIFOType::GM_FIFO, 8, 8, SrcTile, DstTile>(gm_addr, local_addr); +``` + +**`initialize_l2g2l_pipe`(指定 local_fifo_depth=4):** + +```mlir +%pipe = pto.initialize_l2g2l_pipe {dir_mask = 1, local_fifo_depth = 4} + (%gm_addr : memref<...>, %local_addr : i32) +``` + +```cpp +auto v28 = TPipe<0, FIFOType::GM_FIFO, 8, 8, SrcTile, DstTile, + false, 4>(gm_addr, local_addr); +``` + +**FIFOType 映射:** + +| Op | dir_mask | FIFOType | +|---|---|---| +| `initialize_l2l_pipe` | 1 (C2V) | `VEC_FIFO` | +| `initialize_l2l_pipe` | 2 (V2C) | `MAT_FIFO` | +| `initialize_l2g2l_pipe` | 1 (C2V) | `GM_FIFO` | +| `initialize_l2g2l_pipe` | 2 (V2C) | `GM_FIFO` | + +### 7.2 其他 Op + +| Op | EmitC 输出 | +|---|---| +| `declare_tile` | `Tile<...> varname;` | +| `tpop_internal` | `TPOP(tile, pipe);` | +| `tfree_internal` | `TFREE(pipe);` | + +--- + +## 8. Op 定义汇总 + +### 前端 Op + +| Op | 操作数 | 结果 | Pipeline | 可见性 | +|---|---|---|---|---| +| `pto.initialize_l2g2l_pipe` | `gm_addr [, local_addr]` + 属性 | `!pto.pipe` | — | 前端 | +| `pto.initialize_l2l_pipe` | `[local_addr]` + 属性 | `!pto.pipe` | — | 前端 | +| `pto.tpop` | `pipe` | `index`(slot_id) | — | 前端 | +| `pto.get_fifo_tile` | `pipe, slot_id` | `DstTileType` | — | 前端 | +| `pto.tfree` | `pipe, slot_id` | — | — | 前端 | +| `pto.tpush` | `tile, pipe` | — | — | 前端 | + +### 内部 Op + +| Op | 操作数 | 结果 | DPS | Pipeline | +|---|---|---|---|---| +| `pto.declare_tile` | — | `TileBufType` | 否 | — | +| `pto.tpop_internal` | `tile, pipe` + `assigned_pipe` | — | 是 | GM_FIFO: `PIPE_MTE2`;VEC/MAT_FIFO: `PIPE_S` | +| `pto.tfree_internal` | `pipe` | — | 否 | `PIPE_S` | + +--- + +## 9. Tile 大小与 Slot Size + +**slot_size 定义:** FIFO 中每个 slot 的大小,取生产者和消费者 tile 中较大的: + +``` +slot_size = max(srcTile.size, dstTile.size) +``` + +其中 `tile.size = Rows × Cols × sizeof(DType)`。 + +**srcTile 与 dstTile 的三种关系:** + +| 关系 | 条件 | slot_size | 典型场景 | +|---|---|---|---| +| 1:1 相等 | `src.size == dst.size` | 任一 | A3(Cube/Vec 算力 1:1),A5 单 Vec 核 | +| Src 为 Dst 的 2 倍 | `src.size == 2 × dst.size` | src.size | A5 C2V(1 Cube : 2 Vector) | +| Dst 为 Src 的 2 倍 | `dst.size == 2 × src.size` | dst.size | A5 V2C | + +**plan memory 分配量 = `local_fifo_depth × slot_size`** + +--- + +## 10. C++ TPipe 模板参考 + +### 10.1 模板定义(pto-isa) + +```cpp +template +struct TPipe; +``` + +### 10.2 参数说明 + +| # | 参数 | 类型 | 说明 | PTOAS 来源 | +|---|---|---|---|---| +| 1 | `FlagID` | uint8_t | 同步 flag 基地址 | 编译器 flag 分配 pass | +| 2 | `FiFoType` | FIFOType | FIFO 类型 | 由 Op 类型 + dir_mask 推导 | +| 3 | `FiFoDepth` | uint8_t | ring buffer slot 数量 | dir_mask 推导(单向=8) | +| 4 | `FiFoSyncT` | uint8_t | 同步周期 | C++ 默认值(=FiFoDepth) | +| 5 | `TileDataProd` | typename | 生产者 tile 类型 | pipe result type 的 srcTileType | +| 6 | `TileDataCons` | typename | 消费者 tile 类型 | pipe result type 的 dstTileType | +| 7 | `EN_UNIT_FLAG` | bool | unit flag 优化 | C++ 默认值(false) | +| 8 | `LocalFiFoDepth` | uint8_t | GM_FIFO 的 local buffer 深度 | `local_fifo_depth` 属性,默认 2 | +| 9 | `VCRatio` | VecCubeRatio | Vector/Cube 核心比率 | C++ 默认值(V2C1_VECS) | + +### 10.3 VecCubeRatio 枚举 + +```cpp +enum class VecCubeRatio : uint8_t { + V1C1_VEC0 = 0, // 1 Vector : 1 Cube,仅 Vector 0 + V1C1_VEC1 = 1, // 1 Vector : 1 Cube,仅 Vector 1 + V2C1_VECS = 2, // 2 Vector : 1 Cube(默认) +}; +``` + +VCRatio 不可从 tile 大小推导——`src.size == dst.size` 时 V1C1 与 V2C1 均合法。当前使用 C++ 默认值 V2C1_VECS。 + +### 10.4 FIFOType 与行为 + +| FIFOType | 对应 Op | 构造函数参数 | TPUSH | TPOP | +|---|---|---|---|---| +| `VEC_FIFO` | `initialize_l2l_pipe` (C2V) | local addr | 写入 local VEC slot | 返回 local slot tile | +| `MAT_FIFO` | `initialize_l2l_pipe` (V2C) | local addr | 写入 local MAT slot | 返回 local slot tile | +| `GM_FIFO` | `initialize_l2g2l_pipe` | GM addr + local addr | 写入 GM slot | GM → local → 返回 tile | + +--- + +## 11. 端到端示例:EmitC 输出 + +### 11.1 L2L Pipe(A5) + +```cpp +__global__ AICORE void c2v_l2l(int32_t local_addr) { + auto v28 = TPipe<0, FIFOType::VEC_FIFO, 8, 8, AccTile, VecTile>(local_addr); + + // section.cube + TPUSH(acc_tile, v28); + + // section.vector + Tile v30; + TPOP(v30, v28); + TMOV(dst, v30); + TFREE(v28); +} +``` + +### 11.2 L2G2L Pipe(A3 或 A5) + +```cpp +__global__ AICORE void c2v_l2g2l(memref gm_addr, int32_t local_addr) { + auto v28 = TPipe<0, FIFOType::GM_FIFO, 8, 8, AccTile, VecTile>(gm_addr, local_addr); + + // section.cube + TPUSH(acc_tile, v28); + + // section.vector + Tile v30; + TPOP(v30, v28); + TMOV(dst, v30); + TFREE(v28); +} +``` diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index c967a75e..328d5a83 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -3645,4 +3645,222 @@ def TPrintOp: PTO_TOp<"tprint", [ }]; } +//===----------------------------------------------------------------------===// +// TPUSH/TPOP Ring Buffer Communication Ops +//===----------------------------------------------------------------------===// + +// --- Initialization --- + +def InitializeL2G2LPipeOp : PTO_Op<"initialize_l2g2l_pipe", [ + DeclareOpInterfaceMethods +]> { + let summary = "Initialize a local-to-global-to-local pipe handle"; + let description = [{ + Called once at kernel startup. Binds a pipe whose producer writes through GM + and whose consumer exposes a local FIFO slot. + + - gm_addr is required and must be a memref with gm address space. + - local_addr is optional; if omitted, plan memory allocates it. + - local_fifo_depth is optional; if present, it must be positive. + - dir_mask must be 1 (C2V) or 2 (V2C). + }]; + + let arguments = (ins + I8Attr:$dir_mask, + OptionalAttr:$local_fifo_depth, + AnyType:$gm_addr, + Optional:$local_addr + ); + + let results = (outs PipeType:$pipe); + let hasVerifier = 1; + + let assemblyFormat = [{ + `{` `dir_mask` `=` $dir_mask + (`,` `local_fifo_depth` `=` $local_fifo_depth^)? `}` + `(` $gm_addr `:` type($gm_addr) + (`,` $local_addr^ `:` type($local_addr))? `)` + attr-dict `->` qualified(type($pipe)) + }]; +} + +def InitializeL2LPipeOp : PTO_Op<"initialize_l2l_pipe", [ + DeclareOpInterfaceMethods +]> { + let summary = "Initialize a local-to-local pipe handle"; + let description = [{ + Called once at kernel startup. Binds a pipe whose producer and consumer + communicate through local FIFO storage. + + - local_addr is optional; if omitted, plan memory allocates it. + - dir_mask must be 1 (C2V) or 2 (V2C). + }]; + + let arguments = (ins + I8Attr:$dir_mask, + Variadic:$local_addrs + ); + + let results = (outs PipeType:$pipe); + let hasVerifier = 1; + + let assemblyFormat = [{ + `{` `dir_mask` `=` $dir_mask `}` + (`(` $local_addrs^ `:` type($local_addrs) `)`)? + attr-dict `->` qualified(type($pipe)) + }]; +} + +// --- Data Transfer: Push (producer, no DPS) --- + +def TPushOp : PTO_TOp<"tpush", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Push tile data via unified pipe handle"; + + let arguments = (ins + PTODpsType:$tile, + PipeType:$pipe_handle + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $tile `,` $pipe_handle `:` qualified(type($tile)) `,` qualified(type($pipe_handle)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + }]; +} + +// --- Data Transfer: Frontend consumer ops --- + +def TPopOp : PTO_Op<"tpop", [ + DeclareOpInterfaceMethods +]> { + let summary = "Wait for a consumer slot and return its slot id"; + + let arguments = (ins + PipeType:$pipe_handle + ); + + let results = (outs Index:$slot_id); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $pipe_handle `:` qualified(type($pipe_handle)) `)` + attr-dict `->` type($slot_id) + }]; +} + +def GetFifoTileOp : PTO_Op<"get_fifo_tile", [ + ViewLikeOpInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Map a tpop slot id to the corresponding FIFO tile view"; + + let arguments = (ins + PipeType:$pipe_handle, + Index:$slot_id + ); + + let results = (outs PTODpsType:$tile); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $pipe_handle `,` $slot_id `:` qualified(type($pipe_handle)) `,` type($slot_id) `)` + attr-dict `->` qualified(type($tile)) + }]; + + let extraClassDeclaration = [{ + ::mlir::Value getViewSource() { return getPipeHandle(); } + }]; +} + +def TFreeOp : PTO_Op<"tfree", [ + DeclareOpInterfaceMethods +]> { + let summary = "Release a consumer slot after the borrowed FIFO tile is no longer needed"; + + let arguments = (ins + PipeType:$pipe_handle, + Index:$slot_id + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $pipe_handle `,` $slot_id `:` qualified(type($pipe_handle)) `,` type($slot_id) `)` + attr-dict + }]; +} + +// --- Data Transfer: Internal lowered consumer ops --- + +def DeclareTileOp : PTO_Op<"declare_tile", [Pure]> { + let summary = "Declare a tile value whose address will be rebound later"; + + let results = (outs PTODpsType:$tile); + + let assemblyFormat = [{ + attr-dict `->` qualified(type($tile)) + }]; +} + +def TPopInternalOp : PTO_TOp<"tpop_internal", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Lowered consumer pop that binds a declared tile to a FIFO slot"; + + let arguments = (ins + PTODpsType:$tile, + PipeType:$pipe_handle, + PTO_PipeAttr:$assigned_pipe + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $tile `,` $pipe_handle `:` qualified(type($tile)) `,` qualified(type($pipe_handle)) `)` + `{` `assigned_pipe` `=` $assigned_pipe `}` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return getAssignedPipe().getPipe(); } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getTileMutable(); } + }]; +} + +def TFreeInternalOp : PTO_TOp<"tfree_internal", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Lowered consumer slot release"; + + let arguments = (ins + PipeType:$pipe_handle + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $pipe_handle `:` qualified(type($pipe_handle)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; +} + #endif // MLIR_DIALECT_PTO_IR_PTOOPS diff --git a/include/PTO/IR/PTOTypeDefs.td b/include/PTO/IR/PTOTypeDefs.td index d4a79746..4a3bc569 100644 --- a/include/PTO/IR/PTOTypeDefs.td +++ b/include/PTO/IR/PTOTypeDefs.td @@ -184,3 +184,15 @@ def TileBufType : TypeDef { int32_t getPadValueI32() const; // 0 null, 1 zero, 2 max, 3 min }]; } + +def PipeType : TypeDef { + let mnemonic = "pipe"; + let summary = "Pipe handle type for TPUSH/TPOP unified static schedule"; + let parameters = (ins + "mlir::Type":$srcTileType, + "mlir::Type":$dstTileType + ); + let assemblyFormat = [{ + `<` $srcTileType `,` $dstTileType `>` + }]; +} diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index 3df9390d..b28ad7d3 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -38,6 +38,7 @@ enum class PTOArch { std::unique_ptr createPTOHighDimLoweringPass(); std::unique_ptr createPTOVFloopGatherPass(); std::unique_ptr createLoweringSyncToPipePass(); +std::unique_ptr createPTOLowerTPopPass(); // Creates a pass for ... std::unique_ptr createPTOInsertSyncPass(); @@ -64,7 +65,8 @@ std::unique_ptr createPTORemoveRedundantBarrierPass(); std::unique_ptr createPTOViewToMemrefPass(); std::unique_ptr createPTOInsertLoadStoreForMixCVPass(); std::unique_ptr createInferPTOLayoutPass(); -// Declare register function +std::unique_ptr createPTOVerifyTFreePass(); + void registerPTOPasses(); } // namespace pto diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index ab7f29df..32d41c0e 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -116,4 +116,38 @@ def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> { ]; } +def PTOLowerTPop : Pass<"pto-lower-tpop", "func::FuncOp"> { + let summary = "Lower slot-based tpop/get_fifo_tile/tfree to internal pipe consumer ops"; + let description = [{ + Rewrites frontend pipe-consumer IR into internal ops: + - `pto.tpop + pto.get_fifo_tile` -> `pto.declare_tile + pto.tpop_internal` + - `pto.tfree` -> `pto.tfree_internal` + Before lowering, verifies that each `pto.tpop` result has exactly one + matching `pto.get_fifo_tile` and exactly one explicit `pto.tfree`. + }]; + + let constructor = "mlir::pto::createPTOLowerTPopPass()"; + + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + +def PTOVerifyTFree : Pass<"pto-verify-tfree", "func::FuncOp"> { + let summary = "Verify lowered pto.tfree_internal placement for explicit tpop/tfree pairs"; + let description = [{ + For each lowered `pto.tpop_internal` in section.cube / section.vector, this + pass verifies that there is a matching explicit + `pto.tfree_internal(pipe_handle)` later in the same block, that the + borrowed tile is not used after that free, and that the same pipe does not + accumulate multiple outstanding pops before the matching free. + }]; + + let constructor = "mlir::pto::createPTOVerifyTFreePass()"; + + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + #endif // MLIR_DIALECT_PTO_PASSES diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 11514b56..10415698 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -2006,6 +2006,165 @@ LogicalResult RlsBufOp::verify() { return verifyBufSyncOp(getOperation(), getPipe(), getBufIdAttr(), getModeAttr()); } + +//===----------------------------------------------------------------------===// +// TPUSH/TPOP Op Verifiers +//===----------------------------------------------------------------------===// + +static bool isInsideSection(Operation *op) { + return op->getParentOfType() || + op->getParentOfType(); +} + +static bool isInsideSectionCube(Operation *op) { + return op->getParentOfType() != nullptr; +} + +static bool isInsideSectionVector(Operation *op) { + return op->getParentOfType() != nullptr; +} + +static LogicalResult verifyInitPipeDirMask(Operation *op, int8_t dirMask) { + if (isInsideSection(op)) + return op->emitOpError("must be at function level, not inside a section"); + if (dirMask != 1 && dirMask != 2) + return op->emitOpError("dir_mask must be 1 (C2V) or 2 (V2C)"); + return success(); +} + +LogicalResult InitializeL2G2LPipeOp::verify() { + if (failed(verifyInitPipeDirMask(getOperation(), getDirMask()))) + return failure(); + + auto pipeTy = dyn_cast(getPipe().getType()); + if (!pipeTy) + return emitOpError("result type must be !pto.pipe<...>"); + + auto memTy = dyn_cast(getGmAddr().getType()); + if (!memTy || !isGmAddressSpaceAttr(memTy.getMemorySpace())) + return emitOpError("gm_addr must be memref with #pto.address_space"); + + if (Value localAddr = getLocalAddr()) { + if (!localAddr.getType().isInteger(32)) + return emitOpError("local_addr must be i32 when provided"); + } + + if (auto depthAttr = getLocalFifoDepthAttr()) { + if (depthAttr.getInt() <= 0) + return emitOpError("local_fifo_depth must be a positive i8 attribute"); + } + return success(); +} + +LogicalResult InitializeL2LPipeOp::verify() { + if (failed(verifyInitPipeDirMask(getOperation(), getDirMask()))) + return failure(); + + auto pipeTy = dyn_cast(getPipe().getType()); + if (!pipeTy) + return emitOpError("result type must be !pto.pipe<...>"); + + OperandRange localAddrs = getLocalAddrs(); + if (localAddrs.size() > 1) + return emitOpError("accepts at most one local_addr operand"); + + if (localAddrs.size() == 1) { + if (!localAddrs.front().getType().isInteger(32)) + return emitOpError("local_addr must be i32 when provided"); + } + return success(); +} + +static LogicalResult verifyPipeTileType(Operation *op, Type pipeType, + Type tileType, bool isPush) { + auto pipeTy = dyn_cast(pipeType); + if (!pipeTy) + return op->emitOpError("expects pipe operand type !pto.pipe<...>"); + + Type expected = isPush ? pipeTy.getSrcTileType() : pipeTy.getDstTileType(); + if (tileType != expected) { + return op->emitOpError(isPush ? "tile type must match pipe src tile type" + : "tile type must match pipe dst tile type"); + } + return success(); +} + +LogicalResult TPushOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/true); +} + +static FailureOr getConsumerAssignedPipe(Value pipeHandle) { + if (pipeHandle.getDefiningOp()) + return pto::PIPE::PIPE_MTE2; + if (pipeHandle.getDefiningOp()) + return pto::PIPE::PIPE_S; + return failure(); +} + +LogicalResult TPopOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return success(); +} + +LogicalResult GetFifoTileOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + + auto tpopOp = getSlotId().getDefiningOp(); + if (!tpopOp) + return emitOpError("slot_id must be produced by pto.tpop"); + if (getPipeHandle() != tpopOp.getPipeHandle()) + return emitOpError("pipe_handle must match the pto.tpop that produced slot_id"); + + return verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/false); +} + +LogicalResult TFreeOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + + auto tpopOp = getSlotId().getDefiningOp(); + if (!tpopOp) + return emitOpError("slot_id must be produced by pto.tpop"); + if (getPipeHandle() != tpopOp.getPipeHandle()) + return emitOpError("pipe_handle must match the pto.tpop that produced slot_id"); + return success(); +} + +LogicalResult TPopInternalOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + + if (failed(verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/false))) + return failure(); + + pto::PIPE assignedPipe = getAssignedPipe().getPipe(); + if (assignedPipe == pto::PIPE::PIPE_ALL || + assignedPipe == pto::PIPE::PIPE_UNASSIGNED) { + return emitOpError( + "assigned_pipe must be a concrete pipe, not PIPE_ALL/PIPE_UNASSIGNED"); + } + + auto expectedPipe = getConsumerAssignedPipe(getPipeHandle()); + if (failed(expectedPipe)) + return emitOpError("pipe_handle must be produced by a pipe initialization op"); + if (assignedPipe != *expectedPipe) + return emitOpError("assigned_pipe does not match the pipe_handle kind"); + return success(); +} + +LogicalResult TFreeInternalOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return success(); +} + // ---- TOp ---- LogicalResult TGemvBiasOp::verify() { if (getPTOTypeRank(getA().getType()) == -1 || @@ -4441,6 +4600,66 @@ void TMatmulMxBiasOp::getEffects(SmallVectorImpl> + &effects) { + addEffect(effects, &getGmAddrMutable(), MemoryEffects::Read::get()); + if (getLocalAddr()) + addEffect(effects, &getOperation()->getOpOperand(1), MemoryEffects::Read::get()); + addEffect(effects, getOperation()->getOpResult(0), MemoryEffects::Write::get()); +} + +void InitializeL2LPipeOp::getEffects( + SmallVectorImpl> + &effects) { + for (unsigned i = 0; i < getLocalAddrs().size(); ++i) + addEffect(effects, &getOperation()->getOpOperand(i), MemoryEffects::Read::get()); + addEffect(effects, getOperation()->getOpResult(0), MemoryEffects::Write::get()); +} + +void TPushOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getTileMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + +void TPopOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + +void GetFifoTileOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); +} + +void TFreeOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + +void TPopInternalOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); + addEffect(effects, &getTileMutable(), MemoryEffects::Write::get()); +} + +void TFreeInternalOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + // [Include 必须放在最后] #include "PTO/IR/PTOInterfaces.cpp.inc" #define GET_OP_CLASSES diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index d9d013c9..b19a3fde 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -17,6 +17,7 @@ add_mlir_dialect_library(PTOTransforms ConvertToPTOOp.cpp PTOHighDimLowering.cpp PTOVFloopGather.cpp + PTOLowerTPopPass.cpp InsertSync/PTOIRTranslator.cpp InsertSync/SyncCommon.cpp InsertSync/InsertSyncAnalysis.cpp @@ -26,6 +27,7 @@ add_mlir_dialect_library(PTOTransforms InsertSync/SyncEventIdAllocation.cpp InsertSync/SyncCodegen.cpp LoweringSyncToPipe.cpp + PTOVerifyTFreePass.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/PTO diff --git a/lib/PTO/Transforms/PTOLowerTPopPass.cpp b/lib/PTO/Transforms/PTOLowerTPopPass.cpp new file mode 100644 index 00000000..c6fa9c2e --- /dev/null +++ b/lib/PTO/Transforms/PTOLowerTPopPass.cpp @@ -0,0 +1,170 @@ +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOLOWERTPOP +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +static FailureOr getAssignedPipeForConsumer(Value pipeHandle) { + if (pipeHandle.getDefiningOp()) + return pto::PIPE::PIPE_MTE2; + if (pipeHandle.getDefiningOp()) + return pto::PIPE::PIPE_S; + return failure(); +} + +static LogicalResult validateSlotUsers(func::FuncOp funcOp) { + WalkResult walkResult = funcOp.walk([&](TPopOp op) { + GetFifoTileOp getFifoTileOp; + TFreeOp tfreeOp; + + for (OpOperand &use : op.getSlotId().getUses()) { + Operation *user = use.getOwner(); + if (auto getTile = dyn_cast(user)) { + if (getFifoTileOp) { + op.emitError("slot_id must have exactly one pto.get_fifo_tile user"); + return WalkResult::interrupt(); + } + if (getTile.getPipeHandle() != op.getPipeHandle()) { + op.emitError( + "slot_id users must use the same pipe_handle as the producing pto.tpop"); + return WalkResult::interrupt(); + } + getFifoTileOp = getTile; + continue; + } + if (auto tfree = dyn_cast(user)) { + if (tfreeOp) { + op.emitError("slot_id must have exactly one pto.tfree user"); + return WalkResult::interrupt(); + } + if (tfree.getPipeHandle() != op.getPipeHandle()) { + op.emitError( + "slot_id users must use the same pipe_handle as the producing pto.tpop"); + return WalkResult::interrupt(); + } + tfreeOp = tfree; + continue; + } + + op.emitError( + "slot_id can only be used by pto.get_fifo_tile or pto.tfree"); + return WalkResult::interrupt(); + } + + if (!getFifoTileOp) { + op.emitError("slot_id must have exactly one pto.get_fifo_tile user"); + return WalkResult::interrupt(); + } + if (!tfreeOp) { + op.emitError("slot_id must have exactly one pto.tfree user"); + return WalkResult::interrupt(); + } + + if (getFifoTileOp->getBlock() != op->getBlock()) { + op.emitError( + "pto.get_fifo_tile must be in the same block as the producing pto.tpop"); + return WalkResult::interrupt(); + } + if (tfreeOp->getBlock() != op->getBlock()) { + op.emitError( + "pto.tfree must be in the same block as the producing pto.tpop"); + return WalkResult::interrupt(); + } + if (!getFifoTileOp->isBeforeInBlock(tfreeOp)) { + op.emitError( + "pto.tfree must appear after the corresponding pto.get_fifo_tile"); + return WalkResult::interrupt(); + } + + return WalkResult::advance(); + }); + + return walkResult.wasInterrupted() ? failure() : success(); +} + +struct LowerGetFifoTilePattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GetFifoTileOp op, + PatternRewriter &rewriter) const override { + auto tpopOp = op.getSlotId().getDefiningOp(); + if (!tpopOp) + return rewriter.notifyMatchFailure(op, "slot_id must come from pto.tpop"); + + auto assignedPipe = getAssignedPipeForConsumer(op.getPipeHandle()); + if (failed(assignedPipe)) { + return rewriter.notifyMatchFailure( + op, "pipe_handle must be produced by a pipe initialization op"); + } + + auto declaredTile = + rewriter.create(op.getLoc(), op.getTile().getType()); + rewriter.create( + op.getLoc(), declaredTile.getTile(), op.getPipeHandle(), + PipeAttr::get(rewriter.getContext(), *assignedPipe)); + rewriter.replaceOp(op, declaredTile.getTile()); + return success(); + } +}; + +struct LowerTFreePattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFreeOp op, + PatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, op.getPipeHandle()); + return success(); + } +}; + +struct EraseLoweredTPopPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TPopOp op, + PatternRewriter &rewriter) const override { + if (!op.getSlotId().use_empty()) + return failure(); + rewriter.eraseOp(op); + return success(); + } +}; + +struct PTOLowerTPopPass + : public mlir::pto::impl::PTOLowerTPopBase { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + MLIRContext *context = &getContext(); + + if (failed(validateSlotUsers(funcOp))) { + signalPassFailure(); + return; + } + + RewritePatternSet patterns(context); + patterns.add(context); + + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) + signalPassFailure(); + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTOLowerTPopPass() { + return std::make_unique(); +} diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp index 24d24171..6f67460f 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.cpp +++ b/lib/PTO/Transforms/PTOPlanMemory.cpp @@ -160,6 +160,11 @@ void MemLivenessAnalysis::RecursionIR(Region *region, Liveness live) { UpdateOpGenInfo(curOpInfo, llvm::to_vector(gpuLaunchOp->getOperands())); // UpdateOpTempGenInfo(curOpInfo); OpKillHandle(curOpInfo, live, op->getBlock()); + } else if (isa(op)) { + UpdateOpGenInfo(curOpInfo, llvm::to_vector(op->getOperands())); + OpKillHandle(curOpInfo, live, op->getBlock()); } else if (failed(CheckIfUnknownOpTouchBuffer(op))) { return WalkResult::interrupt(); } diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index fd868f27..1830f62d 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -79,6 +79,182 @@ static const char *addrSpaceQualifier(pto::AddressSpace as) { return "__gm__"; } +static std::string getEmitCElementTypeToken(Type elemTy) { + if (elemTy.isF16()) + return "half"; + if (elemTy.isBF16()) + return "bfloat16_t"; + if (elemTy.isF32()) + return "float"; + if (elemTy.isF64()) + return "double"; + if (elemTy.isInteger(8)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int8_t" : "uint8_t"; + } + if (elemTy.isInteger(16)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int16_t" : "uint16_t"; + } + if (elemTy.isInteger(32)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int32_t" : "uint32_t"; + } + if (elemTy.isInteger(64)) { + auto intTy = cast(elemTy); + return intTy.isUnsigned() ? "uint64_t" : "int64_t"; + } + return ""; +} + +static const char *getEmitCTileRoleToken(pto::AddressSpace as) { + switch (as) { + case pto::AddressSpace::VEC: + return "TileType::Vec"; + case pto::AddressSpace::MAT: + return "TileType::Mat"; + case pto::AddressSpace::LEFT: + return "TileType::Left"; + case pto::AddressSpace::RIGHT: + return "TileType::Right"; + case pto::AddressSpace::ACC: + return "TileType::Acc"; + case pto::AddressSpace::BIAS: + return "TileType::Bias"; + case pto::AddressSpace::SCALING: + return "TileType::Scaling"; + case pto::AddressSpace::GM: + case pto::AddressSpace::Zero: + return "TileType::Vec"; + } + return "TileType::Vec"; +} + +struct EmitCTileLayoutDefaults { + const char *blTok; + const char *slTok; + int fractalBytes; + const char *padTok; +}; + +static EmitCTileLayoutDefaults +getEmitCTileLayoutDefaults(pto::AddressSpace as, PTOArch targetArch) { + switch (as) { + case pto::AddressSpace::ACC: + return {"BLayout::ColMajor", "SLayout::RowMajor", 1024, "PadValue::Null"}; + case pto::AddressSpace::RIGHT: + return {"BLayout::RowMajor", "SLayout::ColMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::LEFT: + if (targetArch == PTOArch::A3) { + // pto-isa A2/A3: TileLeft -> BLayout::RowMajor + SLayout::RowMajor. + return {"BLayout::RowMajor", "SLayout::RowMajor", 512, + "PadValue::Null"}; + } + // pto-isa A5: TileLeft -> BLayout::ColMajor + SLayout::RowMajor. + return {"BLayout::ColMajor", "SLayout::RowMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::MAT: + return {"BLayout::ColMajor", "SLayout::RowMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::VEC: + case pto::AddressSpace::BIAS: + case pto::AddressSpace::SCALING: + case pto::AddressSpace::GM: + case pto::AddressSpace::Zero: + return {"BLayout::RowMajor", "SLayout::NoneBox", 512, "PadValue::Null"}; + } + return {"BLayout::RowMajor", "SLayout::NoneBox", 512, "PadValue::Null"}; +} + +static FailureOr getEmitCTileTypeTokenFromType(Type ty, + PTOArch targetArch) { + int64_t rows = ShapedType::kDynamic; + int64_t cols = ShapedType::kDynamic; + Type elemTy; + pto::AddressSpace as = pto::AddressSpace::VEC; + + if (auto mrTy = dyn_cast(ty)) { + if (mrTy.getRank() < 2 || !mrTy.hasStaticShape()) + return failure(); + rows = mrTy.getDimSize(0); + cols = mrTy.getDimSize(1); + elemTy = mrTy.getElementType(); + if (auto asAttr = + dyn_cast_or_null(mrTy.getMemorySpace())) + as = asAttr.getAddressSpace(); + } else if (auto tileTy = dyn_cast(ty)) { + if (tileTy.getRank() < 2) + return failure(); + rows = tileTy.getShape()[0]; + cols = tileTy.getShape()[1]; + elemTy = tileTy.getElementType(); + if (auto asAttr = + dyn_cast_or_null(tileTy.getMemorySpace())) + as = asAttr.getAddressSpace(); + } else { + return failure(); + } + + if (rows == ShapedType::kDynamic || cols == ShapedType::kDynamic) + return failure(); + + std::string elemTok = getEmitCElementTypeToken(elemTy); + if (elemTok.empty()) + return failure(); + + const char *roleTok = getEmitCTileRoleToken(as); + auto defaults = getEmitCTileLayoutDefaults(as, targetArch); + return std::string("Tile<") + roleTok + ", " + elemTok + ", " + + std::to_string(rows) + ", " + std::to_string(cols) + + ", " + defaults.blTok + ", " + std::to_string(rows) + ", " + + std::to_string(cols) + ", " + defaults.slTok + ", " + + std::to_string(defaults.fractalBytes) + ", " + defaults.padTok + ">"; +} + +static bool isPipeInitOp(Operation *op) { + return isa(op); +} + +static FailureOr allocateFlagBaseForInitOp(Operation *op) { + static constexpr int kFlagBasePool[] = {0, 2, 4, 6, 8, 10, 12, 14}; + static constexpr int kFlagBasePoolSize = 8; + int idx = 0; + for (Operation &candidate : *op->getBlock()) { + if (isPipeInitOp(&candidate)) { + if (&candidate == op) + break; + ++idx; + } + } + if (idx < 0 || idx >= kFlagBasePoolSize) + return failure(); + return kFlagBasePool[idx]; +} + +static FailureOr +getTPipePipeTypeTokenForSplitInit(bool isL2G2L, int8_t dirMask) { + if (isL2G2L) + return std::string("FIFOType::GM_FIFO"); + if (dirMask == 1) + return std::string("FIFOType::VEC_FIFO"); + if (dirMask == 2) + return std::string("FIFOType::MAT_FIFO"); + return failure(); +} + +static std::string buildTPipeToken(int flagBase, const std::string &pipeTypeToken, + int slotNum, llvm::StringRef srcTok, + llvm::StringRef dstTok, + std::optional localFifoDepth = std::nullopt) { + std::string tpipeTok = + "TPipe<" + std::to_string(flagBase) + ", " + pipeTypeToken + ", " + + std::to_string(slotNum) + ", " + std::to_string(slotNum) + ", " + + srcTok.str() + ", " + dstTok.str(); + if (localFifoDepth && *localFifoDepth != 2) + tpipeTok += ", false, " + std::to_string(*localFifoDepth); + tpipeTok += ">"; + return tpipeTok; +} + static Value peelUnrealized(Value v) { if (auto castOp = v.getDefiningOp()) return castOp.getOperand(0); @@ -176,6 +352,13 @@ class PTOToEmitCTypeConverter : public TypeConverter { emitc::OpaqueType::get(Ctx, finalTypeStr)); }); + addConversion([Ctx](pto::PipeType type) -> Type { + // Pipe initialization ops return a pipe-dependent handle type in C++. + // Keep it as `auto` to avoid hard-coding a non-existent concrete pipe type. + (void)type; + return emitc::OpaqueType::get(Ctx, "auto"); + }); + // --------------------------------------------------------- // 3. MemRef 转换 (Debug 重点) // --------------------------------------------------------- @@ -2818,6 +3001,11 @@ struct PointerCastConversion : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; + PointerCastConversion(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + enum class TileRole { Vec, Mat, Left, Right, Acc, Bias, Scaling }; static void collectUserOpsThroughCasts(Value v, SmallVectorImpl &out) { @@ -2876,6 +3064,26 @@ struct PointerCastConversion : public OpConversionPattern { return TileRole::Vec; } + EmitCTileLayoutDefaults getDefaultLayoutForRole(TileRole role) const { + switch (role) { + case TileRole::Left: + return getEmitCTileLayoutDefaults(pto::AddressSpace::LEFT, targetArch); + case TileRole::Right: + return getEmitCTileLayoutDefaults(pto::AddressSpace::RIGHT, targetArch); + case TileRole::Acc: + return getEmitCTileLayoutDefaults(pto::AddressSpace::ACC, targetArch); + case TileRole::Bias: + return getEmitCTileLayoutDefaults(pto::AddressSpace::BIAS, targetArch); + case TileRole::Mat: + return getEmitCTileLayoutDefaults(pto::AddressSpace::MAT, targetArch); + case TileRole::Scaling: + return getEmitCTileLayoutDefaults(pto::AddressSpace::SCALING, targetArch); + case TileRole::Vec: + return getEmitCTileLayoutDefaults(pto::AddressSpace::VEC, targetArch); + } + return getEmitCTileLayoutDefaults(pto::AddressSpace::VEC, targetArch); + } + // [新增] 辅助函数:判断 Value 是否源自 arith.constant static bool isConstant(Value v, int64_t &outVal) { if (!v) return false; @@ -2931,9 +3139,12 @@ struct PointerCastConversion : public OpConversionPattern { case TileRole::Scaling: roleTok = "TileType::Scaling"; break; } - // 4. Config & Layout (support BLayoutAttr/SLayoutAttr/PadValueAttr after namespace change) - std::string layoutParams = "BLayout::RowMajor"; - std::string extraParams = ""; + // 4. Config & Layout (default by tile role; explicit config overrides) + auto defaults = getDefaultLayoutForRole(role); + std::string layoutParams = defaults.blTok; + std::string slStr = defaults.slTok; + int32_t frVal = defaults.fractalBytes; + std::string padStr = defaults.padTok; if (auto configOpt = op.getConfig()) { auto config = *configOpt; int32_t blVal = 0; @@ -2946,25 +3157,21 @@ struct PointerCastConversion : public OpConversionPattern { if (auto attr = dyn_cast(config.getSLayout())) slVal = static_cast(attr.getValue()); - std::string slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? "SLayout::ColMajor" : "SLayout::NoneBox"; + slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? "SLayout::ColMajor" : "SLayout::NoneBox"; - int32_t frVal = 0; + frVal = 0; if (auto attr = dyn_cast(config.getSFractalSize())) frVal = attr.getInt(); int32_t padVal = 0; if (auto attr = dyn_cast(config.getPad())) padVal = static_cast(attr.getValue()); - std::string padStr = "PadValue::Null"; + padStr = "PadValue::Null"; switch (padVal) { case 1: padStr = "PadValue::Zero"; break; case 2: padStr = "PadValue::Max"; break; case 3: padStr = "PadValue::Min"; break; } - - if (!slStr.empty()) { - extraParams += ", " + slStr + ", " + std::to_string(frVal) + ", " + padStr; - } } // [核心修改] Valid Dims 处理逻辑 (支持混合静态/动态) @@ -3022,7 +3229,8 @@ struct PointerCastConversion : public OpConversionPattern { // 5. 生成 Tile 类型字符串 std::string tileTypeStr = std::string("Tile<") + roleTok + ", " + elemTypeStr + ", " + dimStr + ", " + - layoutParams + ", " + vrowTok + ", " + vcolTok + extraParams + ">"; + layoutParams + ", " + vrowTok + ", " + vcolTok + ", " + slStr + ", " + + std::to_string(frVal) + ", " + padStr + ">"; auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); Value resultValue; @@ -3070,6 +3278,9 @@ struct PointerCastConversion : public OpConversionPattern { rewriter.replaceOp(op, resultValue); return success(); } + +private: + PTOArch targetArch; }; //===----------------------------------------------------------------------===// @@ -3639,6 +3850,212 @@ struct PTORlsBufToEmitC : public OpConversionPattern { } }; +struct PTOInitializeL2G2LPipeToEmitC + : public OpConversionPattern { + PTOInitializeL2G2LPipeToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult matchAndRewrite(mlir::pto::InitializeL2G2LPipeOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto pipeTy = dyn_cast(op.getPipe().getType()); + if (!pipeTy) + return rewriter.notifyMatchFailure(op, "expected !pto.pipe result type"); + + int8_t dirMask = static_cast(op.getDirMask()); + if (dirMask != 1 && dirMask != 2) + return rewriter.notifyMatchFailure(op, "unsupported dir_mask"); + + auto flagBase = allocateFlagBaseForInitOp(op.getOperation()); + if (failed(flagBase)) + return rewriter.notifyMatchFailure(op, "insufficient FlagID pairs"); + + auto pipeTypeToken = getTPipePipeTypeTokenForSplitInit(/*isL2G2L=*/true, dirMask); + if (failed(pipeTypeToken)) + return rewriter.notifyMatchFailure(op, "failed to map PipeType"); + + int slotNum = 8; + auto srcTok = + getEmitCTileTypeTokenFromType(pipeTy.getSrcTileType(), targetArch); + if (failed(srcTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe src type to Tile<...> token"); + auto dstTok = + getEmitCTileTypeTokenFromType(pipeTy.getDstTileType(), targetArch); + if (failed(dstTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe dst type to Tile<...> token"); + + std::optional localFifoDepth; + if (auto depthAttr = op.getLocalFifoDepthAttr()) + localFifoDepth = depthAttr.getInt(); + std::string tpipeTok = buildTPipeToken(*flagBase, *pipeTypeToken, slotNum, + *srcTok, *dstTok, localFifoDepth); + + Value gmAddr = peelUnrealized(adaptor.getGmAddr()); + Value localAddr = adaptor.getLocalAddr() ? peelUnrealized(adaptor.getLocalAddr()) : Value{}; + if (!localAddr) + return rewriter.notifyMatchFailure( + op, "local_addr must be present at EmitC lowering " + "(plan memory pass should have filled it in)"); + + auto emitPipeTy = + cast(getTypeConverter()->convertType(op.getPipe().getType())); + rewriter.replaceOpWithNewOp( + op, TypeRange{emitPipeTy}, tpipeTok, + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{gmAddr, localAddr}); + return success(); + } + + PTOArch targetArch; +}; + +struct PTOInitializeL2LPipeToEmitC + : public OpConversionPattern { + PTOInitializeL2LPipeToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult matchAndRewrite(mlir::pto::InitializeL2LPipeOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (targetArch == PTOArch::A3) + return rewriter.notifyMatchFailure(op, "A3 architecture does not support initialize_l2l_pipe"); + + auto pipeTy = dyn_cast(op.getPipe().getType()); + if (!pipeTy) + return rewriter.notifyMatchFailure(op, "expected !pto.pipe result type"); + + int8_t dirMask = static_cast(op.getDirMask()); + if (dirMask != 1 && dirMask != 2) + return rewriter.notifyMatchFailure(op, "unsupported dir_mask"); + + auto flagBase = allocateFlagBaseForInitOp(op.getOperation()); + if (failed(flagBase)) + return rewriter.notifyMatchFailure(op, "insufficient FlagID pairs"); + + auto pipeTypeToken = getTPipePipeTypeTokenForSplitInit(/*isL2G2L=*/false, dirMask); + if (failed(pipeTypeToken)) + return rewriter.notifyMatchFailure(op, "failed to map PipeType"); + + int slotNum = 8; + auto srcTok = + getEmitCTileTypeTokenFromType(pipeTy.getSrcTileType(), targetArch); + if (failed(srcTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe src type to Tile<...> token"); + auto dstTok = + getEmitCTileTypeTokenFromType(pipeTy.getDstTileType(), targetArch); + if (failed(dstTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe dst type to Tile<...> token"); + std::string tpipeTok = + buildTPipeToken(*flagBase, *pipeTypeToken, slotNum, *srcTok, *dstTok); + + auto localAddrs = adaptor.getLocalAddrs(); + Value localAddr = !localAddrs.empty() ? peelUnrealized(localAddrs.front()) : Value{}; + if (!localAddr) + return rewriter.notifyMatchFailure( + op, "local_addr must be present at EmitC lowering " + "(plan memory pass should have filled it in)"); + + auto emitPipeTy = + cast(getTypeConverter()->convertType(op.getPipe().getType())); + rewriter.replaceOpWithNewOp( + op, TypeRange{emitPipeTy}, tpipeTok, + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{localAddr}); + return success(); + } + + PTOArch targetArch; +}; + +struct PTOTPushToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::TPushOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TPUSH", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getTile()), + peelUnrealized(adaptor.getPipeHandle()), + }); + return success(); + } +}; + +struct PTODeclareTileToEmitC + : public OpConversionPattern { + PTODeclareTileToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult matchAndRewrite(mlir::pto::DeclareTileOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + auto tileTok = getEmitCTileTypeTokenFromType(op.getTile().getType(), targetArch); + if (failed(tileTok)) + return rewriter.notifyMatchFailure(op, "failed to map declared tile type to Tile<...> token"); + + auto tileTy = emitc::OpaqueType::get(rewriter.getContext(), *tileTok); + auto tile = rewriter + .create( + op.getLoc(), tileTy, emitc::OpaqueAttr::get(rewriter.getContext(), "")) + .getResult(); + rewriter.replaceOp(op, tile); + return success(); + } + + PTOArch targetArch; +}; + +struct PTOTPopInternalToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::TPopInternalOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TPOP", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getTile()), + peelUnrealized(adaptor.getPipeHandle()), + }); + return success(); + } +}; + +struct PTOTFreeInternalToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::TFreeInternalOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TFREE", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getPipeHandle()), + }); + return success(); + } +}; + struct PTOSyncSetToEmitC : public OpConversionPattern { PTOSyncSetToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, PTOArch targetArch) @@ -3932,7 +4349,10 @@ struct PTOTAddToTADD : public OpConversionPattern { // populate patterns //===----------------------------------------------------------------------=== struct ReinterpretCastToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + ReinterpretCastToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} LogicalResult matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -4025,7 +4445,11 @@ struct ReinterpretCastToEmitC : public OpConversionPattern"; + ", " + defaults.blTok + ", " + std::to_string(rows) + ", " + + std::to_string(cols) + ", " + defaults.slTok + ", " + + std::to_string(defaults.fractalBytes) + ", " + defaults.padTok + ">"; auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); Value tile = rewriter @@ -4102,6 +4527,9 @@ struct ReinterpretCastToEmitC : public OpConversionPattern TADDC(dst, src0, src1, src2) @@ -7186,6 +7614,12 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); + patterns.add(typeConverter, ctx, targetArch); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7248,7 +7682,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7319,7 +7753,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7358,6 +7792,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, populateCallOpTypeConversionPattern(patterns, typeConverter); } +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Pass //===----------------------------------------------------------------------===// diff --git a/lib/PTO/Transforms/PTOVerifyTFreePass.cpp b/lib/PTO/Transforms/PTOVerifyTFreePass.cpp new file mode 100644 index 00000000..fbb76fa5 --- /dev/null +++ b/lib/PTO/Transforms/PTOVerifyTFreePass.cpp @@ -0,0 +1,145 @@ +#include "PTO/Transforms/Passes.h" +#include "PTO/IR/PTO.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOVERIFYTFREE +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +/// Find a matching lowered tfree in the same block. +static TFreeInternalOp findMatchingTFree(TPopInternalOp tpopOp) { + Value pipeHandle = tpopOp.getPipeHandle(); + Block *block = tpopOp->getBlock(); + for (auto it = std::next(tpopOp->getIterator()), end = block->end(); + it != end; ++it) { + if (auto tfreeOp = dyn_cast(&*it)) { + if (tfreeOp.getPipeHandle() == pipeHandle) + return tfreeOp; + } + } + return {}; +} + +static Operation *getTopLevelAncestorInBlock(Operation *op, Block *block) { + Operation *current = op; + while (current && current->getBlock() != block) { + Region *parentRegion = current->getParentRegion(); + if (!parentRegion) + return nullptr; + current = parentRegion->getParentOp(); + } + return current; +} + +static bool hasSamePipeTPopInRegion(Operation *op, Value pipeHandle, + TPopInternalOp current) { + bool found = false; + op->walk([&](TPopInternalOp nestedTpop) { + if (nestedTpop == current) + return WalkResult::advance(); + if (nestedTpop.getPipeHandle() == pipeHandle) { + found = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + return found; +} + +static LogicalResult verifySingleOutstandingUntil(TPopInternalOp tpopOp, + Operation *freeBoundary) { + if (!freeBoundary) + return success(); + if (freeBoundary == tpopOp.getOperation()) + return success(); + + Value pipeHandle = tpopOp.getPipeHandle(); + Block *block = tpopOp->getBlock(); + for (auto it = std::next(tpopOp->getIterator()), end = block->end(); + it != end; ++it) { + Operation *op = &*it; + if (hasSamePipeTPopInRegion(op, pipeHandle, tpopOp)) { + return tpopOp.emitOpError( + "multiple outstanding pops on the same pipe are not supported"); + } + if (op == freeBoundary) + break; + } + + return success(); +} + +static LogicalResult verifyNoTileUsesAfterTFree(TPopInternalOp tpopOp, + TFreeInternalOp tfreeOp) { + Value tile = tpopOp.getTile(); + Block *block = tpopOp->getBlock(); + + for (OpOperand &use : tile.getUses()) { + Operation *topLevelOwner = + getTopLevelAncestorInBlock(use.getOwner(), block); + if (!topLevelOwner) { + return tpopOp.emitOpError( + "borrowed tile uses must stay in the same parent block as the producing tpop"); + } + if (tfreeOp->isBeforeInBlock(topLevelOwner)) { + return tpopOp.emitOpError( + "tfree_internal must appear after the last use of the borrowed tile"); + } + } + + return success(); +} + +struct PTOVerifyTFreePass + : public mlir::pto::impl::PTOVerifyTFreeBase { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + + // Collect tpop ops first to avoid iterator invalidation. + SmallVector tpops; + funcOp.walk([&](TPopInternalOp op) { tpops.push_back(op); }); + + for (TPopInternalOp tpopOp : tpops) { + // Must be inside a section. + if (!tpopOp->getParentOfType() && + !tpopOp->getParentOfType()) + continue; + + TFreeInternalOp existingTFree = findMatchingTFree(tpopOp); + if (!existingTFree) { + tpopOp.emitOpError("requires an explicit matching tfree_internal"); + signalPassFailure(); + return; + } + + // Reject cases where the same pipe is popped again before this borrowed + // tile reaches its explicit free boundary. + if (failed( + verifySingleOutstandingUntil(tpopOp, existingTFree.getOperation()))) { + signalPassFailure(); + return; + } + + if (failed(verifyNoTileUsesAfterTFree(tpopOp, existingTFree))) { + signalPassFailure(); + return; + } + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTOVerifyTFreePass() { + return std::make_unique(); +} diff --git a/test/basic/get_fifo_tile_verify.mlir b/test/basic/get_fifo_tile_verify.mlir new file mode 100644 index 00000000..8397ec1d --- /dev/null +++ b/test/basic/get_fifo_tile_verify.mlir @@ -0,0 +1,25 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @verify_get_fifo_tile_pipe_mismatch( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %local_addr0: i32, + %local_addr1: i32) { + %pipe0 = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, %local_addr0 : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + %pipe1 = pto.initialize_l2l_pipe {dir_mask = 1} + (%local_addr1 : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe0 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %tile = pto.get_fifo_tile(%pipe1, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tfree(%pipe0, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + return + } +} + +// CHECK: error: 'pto.get_fifo_tile' op pipe_handle must match the pto.tpop that produced slot_id diff --git a/test/basic/initialize_pipe_verify_gm_missing.mlir b/test/basic/initialize_pipe_verify_gm_missing.mlir new file mode 100644 index 00000000..d12086a2 --- /dev/null +++ b/test/basic/initialize_pipe_verify_gm_missing.mlir @@ -0,0 +1,11 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @verify_gm_missing_fifo_addr() { + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + return + } +} + +// CHECK: error: expected '(' diff --git a/test/basic/initialize_pipe_verify_gm_wrongtype.mlir b/test/basic/initialize_pipe_verify_gm_wrongtype.mlir new file mode 100644 index 00000000..45e3a6ac --- /dev/null +++ b/test/basic/initialize_pipe_verify_gm_wrongtype.mlir @@ -0,0 +1,12 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @verify_gm_wrong_fifo_type(%addr: i32) { + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%addr : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + return + } +} + +// CHECK: error: 'pto.initialize_l2g2l_pipe' op gm_addr must be memref with #pto.address_space diff --git a/test/basic/initialize_pipe_verify_local_wrongtype.mlir b/test/basic/initialize_pipe_verify_local_wrongtype.mlir new file mode 100644 index 00000000..b936633f --- /dev/null +++ b/test/basic/initialize_pipe_verify_local_wrongtype.mlir @@ -0,0 +1,13 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @verify_local_wrong_fifo_type( + %buf: memref<64x128xf32, #pto.address_space>) { + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%buf : memref<64x128xf32, #pto.address_space>) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + return + } +} + +// CHECK: error: 'pto.initialize_l2l_pipe' op local_addr must be i32 when provided diff --git a/test/basic/tfree_after_if.mlir b/test/basic/tfree_after_if.mlir new file mode 100644 index 00000000..21b18736 --- /dev/null +++ b/test/basic/tfree_after_if.mlir @@ -0,0 +1,49 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s +// +// Explicit tfree can be written after structured control flow that consumes the +// borrowed tile. + +module { + func.func @explicit_tfree_after_if( + %cond: i1, + %c2v_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %vec_mem2 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile2 = pto.bind_tile %vec_mem2, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + + scf.if %cond { + pto.tabs ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile2 : memref<32x128xf32, #pto.address_space>) + } else { + pto.tneg ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile2 : memref<32x128xf32, #pto.address_space>) + } + + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: TPOP( +// CHECK: if ( +// CHECK: TABS( +// CHECK: TNEG( +// CHECK: TFREE( diff --git a/test/basic/tfree_emitc_a2a3.mlir b/test/basic/tfree_emitc_a2a3.mlir new file mode 100644 index 00000000..aeea0a8d --- /dev/null +++ b/test/basic/tfree_emitc_a2a3.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @tfree_a2a3_l2g2l( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %local_fifo_addr: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, %local_fifo_addr : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: TPipe<{{.*}}Tile> + %cube_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %cube_tile = pto.bind_tile %cube_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe_c2v = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + %pipe_v2c = pto.initialize_l2l_pipe {dir_mask = 2} + (%v2c_consumer_buf : i32) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + // C2V: acc pushes, vector pops + frees + pto.section.cube { + pto.tpush(%acc_tile, %pipe_c2v : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + pto.section.vector { + %slot_id_c2v = pto.tpop(%pipe_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %vec_fifo_tile = pto.get_fifo_tile(%pipe_c2v, %slot_id_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%vec_fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe_c2v, %slot_id_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + // V2C: vector pushes, cube pops + frees + pto.section.vector { + pto.tpush(%vec_tile, %pipe_v2c : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + pto.section.cube { + %slot_id_v2c = pto.tpop(%pipe_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>) -> index + %cube_fifo_tile = pto.get_fifo_tile(%pipe_v2c, %slot_id_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>, index) + -> memref<64x128xf32, #pto.address_space> + pto.tmov ins(%cube_fifo_tile : memref<64x128xf32, #pto.address_space>) outs(%cube_tile : memref<64x128xf32, #pto.address_space>) + pto.tfree(%pipe_v2c, %slot_id_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: TPipe<{{.*}}Tile +// CHECK: TPUSH( +// CHECK: TPOP( +// CHECK: TFREE( +// CHECK: TPUSH( +// CHECK: TPOP( +// CHECK: TFREE( diff --git a/test/basic/tfree_missing.mlir b/test/basic/tfree_missing.mlir new file mode 100644 index 00000000..7e78742d --- /dev/null +++ b/test/basic/tfree_missing.mlir @@ -0,0 +1,39 @@ +// RUN: not ptoas --pto-arch=a5 %s 2>&1 | FileCheck %s +// +// Frontend users must write an explicit pto.tfree for every borrowed slot. + +module { + func.func @missing_explicit_tfree( + %c2v_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + // A second vec tile for the abs destination + %vec_mem2 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile2 = pto.bind_tile %vec_mem2, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + + pto.tabs ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile2 : memref<32x128xf32, #pto.address_space>) + } + + return + } +} + +// CHECK: error: +// CHECK: slot_id must have exactly one pto.tfree user diff --git a/test/basic/tfree_multiple_outstanding.mlir b/test/basic/tfree_multiple_outstanding.mlir new file mode 100644 index 00000000..3b169ae6 --- /dev/null +++ b/test/basic/tfree_multiple_outstanding.mlir @@ -0,0 +1,47 @@ +// RUN: not ptoas --pto-arch=a5 %s 2>&1 | FileCheck %s +// +// One pipe may not have two outstanding borrowed tiles at the same time, even +// when both slots eventually have explicit tfree. + +module { + func.func @multiple_outstanding_same_pipe( + %c2v_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem0 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile0 = pto.bind_tile %vec_mem0, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %vec_mem1 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile1 = pto.bind_tile %vec_mem1, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id0 = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile0 = pto.get_fifo_tile(%pipe, %slot_id0 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + + %slot_id1 = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile1 = pto.get_fifo_tile(%pipe, %slot_id1 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + + // The first borrowed tile stays live past the second pop. + pto.tabs ins(%fifo_tile0 : memref<32x128xf32, #pto.address_space>) outs(%vec_tile0 : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id0 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + pto.tmov ins(%fifo_tile1 : memref<32x128xf32, #pto.address_space>) outs(%vec_tile1 : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id1 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: error: +// CHECK: multiple outstanding pops on the same pipe are not supported diff --git a/test/basic/tfree_ops.mlir b/test/basic/tfree_ops.mlir new file mode 100644 index 00000000..21d0b621 --- /dev/null +++ b/test/basic/tfree_ops.mlir @@ -0,0 +1,33 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s + +module { + func.func @tfree_basic( + %c2v_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: __global__ AICORE void tfree_basic +// CHECK: TPOP( +// CHECK: TFREE( diff --git a/test/basic/tfree_use_after_free.mlir b/test/basic/tfree_use_after_free.mlir new file mode 100644 index 00000000..112b2780 --- /dev/null +++ b/test/basic/tfree_use_after_free.mlir @@ -0,0 +1,34 @@ +// RUN: not ptoas --pto-arch=a5 %s 2>&1 | FileCheck %s +// +// Explicit tfree must appear after the last use of the borrowed tile. + +module { + func.func @tfree_before_last_use( + %c2v_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem0 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile0 = pto.bind_tile %vec_mem0, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile0 : memref<32x128xf32, #pto.address_space>) + } + + return + } +} + +// CHECK: error: +// CHECK: tfree_internal must appear after the last use of the borrowed tile diff --git a/test/basic/tfree_verify.mlir b/test/basic/tfree_verify.mlir new file mode 100644 index 00000000..db9fca7e --- /dev/null +++ b/test/basic/tfree_verify.mlir @@ -0,0 +1,17 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @tfree_outside_section( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + // tfree outside section — should fail + pto.tfree(%pipe, %c0 : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + return + } +} + +// CHECK: error: 'pto.tfree' op must be inside a section.cube or section.vector diff --git a/test/basic/tpush_tpop_fifo_emitc.mlir b/test/basic/tpush_tpop_fifo_emitc.mlir new file mode 100644 index 00000000..00f67329 --- /dev/null +++ b/test/basic/tpush_tpop_fifo_emitc.mlir @@ -0,0 +1,48 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @pipe_emitc( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %local_fifo_addr: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1, local_fifo_depth = 4} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, %local_fifo_addr : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + return + } +} + +// CHECK: #include "pto/pto-inst.hpp" +// CHECK-NOT: memref< +// CHECK: __global__ AICORE void pipe_emitc +// CHECK: auto {{.*}} = TPipe<0, FIFOType::GM_FIFO +// CHECK-SAME: Tile +// CHECK-SAME: Tile> + %cube_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %cube_tile = pto.bind_tile %cube_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe_c2v = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + %pipe_v2c = pto.initialize_l2l_pipe {dir_mask = 2} + (%v2c_consumer_buf : i32) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe_c2v : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + %slot_id_v2c = pto.tpop(%pipe_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>) -> index + %cube_fifo_tile = pto.get_fifo_tile(%pipe_v2c, %slot_id_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>, index) + -> memref<64x128xf32, #pto.address_space> + pto.tmov ins(%cube_fifo_tile : memref<64x128xf32, #pto.address_space>) outs(%cube_tile : memref<64x128xf32, #pto.address_space>) + pto.tfree(%pipe_v2c, %slot_id_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>, index) + } + + pto.section.vector { + %slot_id_c2v = pto.tpop(%pipe_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %vec_fifo_tile = pto.get_fifo_tile(%pipe_c2v, %slot_id_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%vec_fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe_c2v, %slot_id_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + pto.tpush(%vec_tile, %pipe_v2c : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK: __global__ AICORE void pipe_emitc_a5_dirmask +// CHECK: auto {{.*}} = TPipe<0, FIFOType::VEC_FIFO +// CHECK-SAME: Tile +// CHECK: auto {{.*}} = TPipe<2, FIFOType::MAT_FIFO +// CHECK-SAME: Tile +// CHECK-DAG: TPUSH( +// CHECK-DAG: TPOP( +// CHECK-DAG: TFREE( diff --git a/test/basic/tpush_tpop_fifo_ops.mlir b/test/basic/tpush_tpop_fifo_ops.mlir new file mode 100644 index 00000000..42b55429 --- /dev/null +++ b/test/basic/tpush_tpop_fifo_ops.mlir @@ -0,0 +1,46 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @pipe_ops( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %local_fifo_addr: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, %local_fifo_addr : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + pto.section.vector { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + -> memref<32x128xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<32x128xf32, #pto.address_space>>, index) + } + + return + } +} + +// CHECK: __global__ AICORE void pipe_ops +// CHECK: TPipe< +// CHECK: Tile&1 | FileCheck %s + +module { + func.func @verify_bidirectional_dir_mask_not_supported( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %acc_tile: memref<64x128xf32, #pto.address_space>) { + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 3} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK: error: 'pto.initialize_l2g2l_pipe' op dir_mask must be 1 (C2V) or 2 (V2C) diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/README.md b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md new file mode 100644 index 00000000..6d86fc6b --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md @@ -0,0 +1,47 @@ +# A5 临时样例: TMATMUL + TPUSH/TPOP(UB) + TPRINT + +这个目录是临时验证用例,可直接删除。 + +## 目录说明 + +- `kernel.mlir` + - PTOAS 源 IR,表达目标链路: + - Cube: `tload(gm->mat)` -> `tmov(mat->left/right)` -> `tmatmul` -> `tpush(acc, pipe)` + - Vector: `tpop(vec, pipe)` -> `tprint` +- `kernel_a5_manual.cpp` + - 可运行的 A5 kernel C++(保留 `TPRINT`) +- `main.cpp` + - Host 侧 ACL 启动代码,构造输入并拉起 kernel + - 输入设为 `A=单位阵`、`B=全1矩阵`,便于按 `TPRINT` 结果快速验算 + +## 1. 用 PTOAS 检查 MLIR + +在仓库根目录执行: + +```bash +./build/tools/ptoas/ptoas --pto-arch=a5 test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir > /tmp/tpush_tpop_matmul_pipe.out +``` + +可提取 emitc 末尾 C++: + +```bash +awk 'BEGIN{emit=0} /^#include "pto\/pto-inst.hpp"/{emit=1} emit{print}' /tmp/tpush_tpop_matmul_pipe.out > /tmp/tpush_tpop_matmul_pipe_emitc.cpp +``` + +说明:当前流水线中,`kernel.mlir` 里的 `pto.tprint` 在部分 pass 组合下可能被提前折叠。 +所以本目录提供了 `kernel_a5_manual.cpp` 来保证运行路径上确实有 `TPRINT`。 + +## 2. 运行方式(参考 pto-isa A5 st 框架) + +将 `kernel_a5_manual.cpp` + `main.cpp` 按你现有 A5 st 工程的 CMake 方式接入并编译。 + +运行后,Vector 段执行 `TPRINT(vecOut)`,日志里应可看到打印输出。 +本样例里理论结果应是全 `1.0`(`A * B = I * Ones = Ones`)。 + +## 3. 参数约定 + +- Tile 形状:`16x16` +- 输入类型:`f32` +- 输出累加类型:`f32` +- `pipe`:`dir_mask = 1`,C2V,A5 下映射 UB `VEC_FIFO` +- `main.cpp` 里 `c2vBuf = 0x10000` 为示例 UB 基址 diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir new file mode 100644 index 00000000..e478bba6 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir @@ -0,0 +1,64 @@ +module { + func.func @matmul_tpush_tpop_print( + %gm_a: memref<16x16xf32, #pto.address_space>, + %gm_b: memref<16x16xf32, #pto.address_space>, + %c2v_consumer_buf: i32) { + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + + %mat_a_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %mat_b_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %left_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %right_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %acc_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<8x16xf32, #pto.address_space> + + // TileBuf-only tprint path (currently may be folded out by some lowering passes). + %vec_print = pto.alloc_tile : !pto.tile_buf + + %mat_a_tile = pto.bind_tile %mat_a_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %mat_b_tile = pto.bind_tile %mat_b_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %left_tile = pto.bind_tile %left_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %right_tile = pto.bind_tile %right_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %acc_tile = pto.bind_tile %acc_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c8, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<8x16xf32, #pto.address_space> -> memref<8x16xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, location = #pto.pipe_location} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<8x16xf32, #pto.address_space>> + + pto.section.cube { + pto.tload ins(%gm_a : memref<16x16xf32, #pto.address_space>) outs(%mat_a_tile : memref<16x16xf32, #pto.address_space>) + pto.tload ins(%gm_b : memref<16x16xf32, #pto.address_space>) outs(%mat_b_tile : memref<16x16xf32, #pto.address_space>) + + pto.tmov ins(%mat_a_tile : memref<16x16xf32, #pto.address_space>) outs(%left_tile : memref<16x16xf32, #pto.address_space>) + pto.tmov ins(%mat_b_tile : memref<16x16xf32, #pto.address_space>) outs(%right_tile : memref<16x16xf32, #pto.address_space>) + + pto.tmatmul ins(%left_tile, %right_tile : memref<16x16xf32, #pto.address_space>, memref<16x16xf32, #pto.address_space>) outs(%acc_tile : memref<16x16xf32, #pto.address_space>) + + pto.tpush(%acc_tile, %pipe : memref<16x16xf32, #pto.address_space>, !pto.pipe>, memref<8x16xf32, #pto.address_space>>) + } + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<8x16xf32, #pto.address_space>, !pto.pipe>, memref<8x16xf32, #pto.address_space>>) + pto.tmov ins(%vec_tile : memref<8x16xf32, #pto.address_space>) outs(%vec_print : !pto.tile_buf) + pto.tprint ins(%vec_print : !pto.tile_buf) + } + + return + } +} diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp new file mode 100644 index 00000000..3e9cf987 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp @@ -0,0 +1,78 @@ +#include "pto/pto-inst.hpp" + +using namespace pto; + +__global__ AICORE void matmul_tpush_tpop_print(__gm__ float *gm_a, __gm__ float *gm_b, + __gm__ float *gm_slot_buffer, + int32_t c2v_consumer_buf, + int32_t v2c_consumer_buf) +{ + (void)gm_slot_buffer; + (void)v2c_consumer_buf; + + int64_t base0 = 0; + int64_t base512 = 512; + + Tile matA; + TASSIGN(matA, base0); + Tile matB; + TASSIGN(matB, base512); + + Tile leftA; + TASSIGN(leftA, base0); + Tile rightB; + TASSIGN(rightB, base0); + + Tile accC; + TASSIGN(accC, base0); + Tile vecOut; + TASSIGN(vecOut, base0); + + auto pipe = TPipe<0, FIFOType::VEC_FIFO, 8, 8, + Tile, + Tile, + TSyncOpType::TMOV_C2UB, TSyncOpType::NONE, VecCubeRatio::V2C1_VECS>(c2v_consumer_buf); + +#if defined(__DAV_CUBE__) + using GTShape = pto::Shape<1, 1, 1, 16, 16>; + using GTStride = pto::Stride<256, 256, 256, 16, 1>; + using GlobalFloat = GlobalTensor; + + GTShape shape = GTShape(); + GTStride stride = GTStride(); + + GlobalFloat gA(gm_a, shape, stride); + GlobalFloat gB(gm_b, shape, stride); + + TLOAD(matA, gA); + TLOAD(matB, gB); + TMOV(leftA, matA); + TMOV(rightB, matB); + TMATMUL(accC, leftA, rightB); + TPUSH(accC, pipe); +#endif + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + TPOP(vecOut, pipe); + TPRINT(vecOut); + TFREE(pipe); +#endif +} + +template +void LaunchMatmulTPushPopPrint(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream) +{ + (void)tilingKey; + matmul_tpush_tpop_print<<<1, nullptr, stream>>>(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(slot), + c2vBuf, v2cBuf); +} + +template void LaunchMatmulTPushPopPrint<1>(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream); diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp new file mode 100644 index 00000000..353bfabf --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp @@ -0,0 +1,70 @@ +#include "acl/acl.h" +#include +#include +#include +#include + +template +void LaunchMatmulTPushPopPrint(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream); + +#define ACL_CHECK(expr) \ + do { \ + aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ACL ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) + +int main() +{ + constexpr int M = 16; + constexpr int K = 16; + constexpr int N = 16; + constexpr size_t aBytes = M * K * sizeof(float); + constexpr size_t bBytes = K * N * sizeof(float); + constexpr size_t slotBytes = M * N * sizeof(float); + + std::vector hostA(M * K, 0.0f); + std::vector hostB(K * N, 1.0f); + std::vector hostSlot(M * N, 0.0f); + for (int i = 0; i < M; ++i) { + hostA[i * K + i] = 1.0f; // A = I + } + + ACL_CHECK(aclInit(nullptr)); + ACL_CHECK(aclrtSetDevice(0)); + + aclrtStream stream = nullptr; + ACL_CHECK(aclrtCreateStream(&stream)); + + uint8_t *devA = nullptr; + uint8_t *devB = nullptr; + uint8_t *devSlot = nullptr; + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devA), aBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devB), bBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devSlot), slotBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(devA, aBytes, hostA.data(), aBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devB, bBytes, hostB.data(), bBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devSlot, slotBytes, hostSlot.data(), slotBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + + // For A5 C2V VEC_FIFO path, c2vBuf is local UB base address (example value). + constexpr int32_t c2vBuf = 0x10000; + constexpr int32_t v2cBuf = 0; + + LaunchMatmulTPushPopPrint<1>(devA, devB, devSlot, c2vBuf, v2cBuf, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + std::puts("Kernel finished. Expect TPRINT output to be all 1.0 (A=I, B=all-ones)."); + + aclrtFree(devA); + aclrtFree(devB); + aclrtFree(devSlot); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + return 0; +} diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 480c763d..f32ee107 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -638,6 +638,8 @@ int main(int argc, char **argv) { // pm.addNestedPass(pto::createPTOInsertCVMovPass()); // pm.addNestedPass(pto::createPTOConvertToDPSPass()); // pm.addNestedPass(pto::createPTOInsertLoadStoreForMixCVPass()); + pm.addNestedPass(pto::createPTOLowerTPopPass()); + pm.addNestedPass(pto::createPTOVerifyTFreePass()); pm.addNestedPass(pto::createLoweringSyncToPipePass()); if (!disableInferLayout) @@ -654,6 +656,10 @@ int main(int argc, char **argv) { pm.addPass(pto::createPlanMemoryPass(planMemoryOption)); } + std::string arch = ptoTargetArch; + for (char &c : arch) + c = static_cast(std::tolower(static_cast(c))); + // Conditionally add Sync pass based on flag if (enableInsertSync) { if (effectiveLevel == PTOBuildLevel::Level3) { @@ -669,9 +675,7 @@ int main(int argc, char **argv) { // pm.addNestedPass(pto::createPTOVFloopGatherPass()); pm.addPass(createCSEPass()); - std::string arch = ptoTargetArch; - for (char &c : arch) - c = static_cast(std::tolower(static_cast(c))); + if (arch == "a3") { pm.addPass(pto::createEmitPTOManualPass(pto::PTOArch::A3)); } else if (arch == "a5") { From 5ade25f23f0458ad2eb5a05587e126e9875d36a1 Mon Sep 17 00:00:00 2001 From: qukelin Date: Sat, 14 Mar 2026 14:38:35 +0800 Subject: [PATCH 2/4] pto: derive tpush tpop pipeline from pipe info --- include/PTO/IR/PTOOps.td | 41 ++++++++++++++++--- lib/PTO/IR/PTO.cpp | 31 +++++--------- lib/PTO/Transforms/PTOLowerTPopPass.cpp | 19 +-------- .../tpop_internal_pipeline_from_pipe.mlir | 33 +++++++++++++++ .../tpush_tpop_fifo_emitc_a5_dirmask.mlir | 6 +++ 5 files changed, 87 insertions(+), 43 deletions(-) create mode 100644 test/basic/tpop_internal_pipeline_from_pipe.mlir diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 328d5a83..3e777367 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -3733,7 +3733,34 @@ def TPushOp : PTO_TOp<"tpush", [ }]; let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + ::mlir::pto::PIPE getPipe() { + auto isAddressSpace = [](Type ty, ::mlir::pto::AddressSpace target) -> bool { + if (auto tb = ::mlir::dyn_cast<::mlir::pto::TileBufType>(ty)) { + if (auto as = ::mlir::dyn_cast_or_null<::mlir::pto::AddressSpaceAttr>( + tb.getMemorySpace())) + return as.getAddressSpace() == target; + return false; + } + if (auto mr = ::mlir::dyn_cast<::mlir::MemRefType>(ty)) { + if (auto ms = mr.getMemorySpace()) { + if (auto as = ::mlir::dyn_cast<::mlir::pto::AddressSpaceAttr>(ms)) + return as.getAddressSpace() == target; + } + return false; + } + return false; + }; + + auto pipeTy = ::mlir::dyn_cast<::mlir::pto::PipeType>(getPipeHandle().getType()); + if (!pipeTy) + return ::mlir::pto::PIPE::PIPE_UNASSIGNED; + + if (isAddressSpace(pipeTy.getSrcTileType(), ::mlir::pto::AddressSpace::ACC)) + return ::mlir::pto::PIPE::PIPE_FIX; + if (isAddressSpace(pipeTy.getSrcTileType(), ::mlir::pto::AddressSpace::VEC)) + return ::mlir::pto::PIPE::PIPE_MTE3; + return ::mlir::pto::PIPE::PIPE_UNASSIGNED; + } }]; } @@ -3821,8 +3848,7 @@ def TPopInternalOp : PTO_TOp<"tpop_internal", [ let arguments = (ins PTODpsType:$tile, - PipeType:$pipe_handle, - PTO_PipeAttr:$assigned_pipe + PipeType:$pipe_handle ); let results = (outs); @@ -3830,12 +3856,17 @@ def TPopInternalOp : PTO_TOp<"tpop_internal", [ let assemblyFormat = [{ `(` $tile `,` $pipe_handle `:` qualified(type($tile)) `,` qualified(type($pipe_handle)) `)` - `{` `assigned_pipe` `=` $assigned_pipe `}` attr-dict }]; let extraClassDeclaration = [{ - ::mlir::pto::PIPE getPipe() { return getAssignedPipe().getPipe(); } + ::mlir::pto::PIPE getPipe() { + if (getPipeHandle().getDefiningOp<::mlir::pto::InitializeL2G2LPipeOp>()) + return ::mlir::pto::PIPE::PIPE_MTE2; + if (getPipeHandle().getDefiningOp<::mlir::pto::InitializeL2LPipeOp>()) + return ::mlir::pto::PIPE::PIPE_S; + return ::mlir::pto::PIPE::PIPE_UNASSIGNED; + } ::mlir::MutableOperandRange getDpsInitsMutable() { return getTileMutable(); } }]; } diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 10415698..96c2f9c7 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -2092,16 +2092,13 @@ static LogicalResult verifyPipeTileType(Operation *op, Type pipeType, LogicalResult TPushOp::verify() { if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) return emitOpError("must be inside a section.cube or section.vector"); - return verifyPipeTileType(getOperation(), getPipeHandle().getType(), - getTile().getType(), /*isPush=*/true); -} - -static FailureOr getConsumerAssignedPipe(Value pipeHandle) { - if (pipeHandle.getDefiningOp()) - return pto::PIPE::PIPE_MTE2; - if (pipeHandle.getDefiningOp()) - return pto::PIPE::PIPE_S; - return failure(); + if (failed(verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/true))) + return failure(); + if (getPipe() == pto::PIPE::PIPE_UNASSIGNED) + return emitOpError( + "pipe_handle source tile type must map to a supported producer pipe"); + return success(); } LogicalResult TPopOp::verify() { @@ -2144,18 +2141,10 @@ LogicalResult TPopInternalOp::verify() { getTile().getType(), /*isPush=*/false))) return failure(); - pto::PIPE assignedPipe = getAssignedPipe().getPipe(); - if (assignedPipe == pto::PIPE::PIPE_ALL || - assignedPipe == pto::PIPE::PIPE_UNASSIGNED) { + pto::PIPE pipe = getPipe(); + if (pipe == pto::PIPE::PIPE_ALL || pipe == pto::PIPE::PIPE_UNASSIGNED) return emitOpError( - "assigned_pipe must be a concrete pipe, not PIPE_ALL/PIPE_UNASSIGNED"); - } - - auto expectedPipe = getConsumerAssignedPipe(getPipeHandle()); - if (failed(expectedPipe)) - return emitOpError("pipe_handle must be produced by a pipe initialization op"); - if (assignedPipe != *expectedPipe) - return emitOpError("assigned_pipe does not match the pipe_handle kind"); + "pipe_handle must be produced by a supported pipe initialization op"); return success(); } diff --git a/lib/PTO/Transforms/PTOLowerTPopPass.cpp b/lib/PTO/Transforms/PTOLowerTPopPass.cpp index c6fa9c2e..e242110f 100644 --- a/lib/PTO/Transforms/PTOLowerTPopPass.cpp +++ b/lib/PTO/Transforms/PTOLowerTPopPass.cpp @@ -18,14 +18,6 @@ using namespace mlir::pto; namespace { -static FailureOr getAssignedPipeForConsumer(Value pipeHandle) { - if (pipeHandle.getDefiningOp()) - return pto::PIPE::PIPE_MTE2; - if (pipeHandle.getDefiningOp()) - return pto::PIPE::PIPE_S; - return failure(); -} - static LogicalResult validateSlotUsers(func::FuncOp funcOp) { WalkResult walkResult = funcOp.walk([&](TPopOp op) { GetFifoTileOp getFifoTileOp; @@ -105,17 +97,10 @@ struct LowerGetFifoTilePattern : public OpRewritePattern { if (!tpopOp) return rewriter.notifyMatchFailure(op, "slot_id must come from pto.tpop"); - auto assignedPipe = getAssignedPipeForConsumer(op.getPipeHandle()); - if (failed(assignedPipe)) { - return rewriter.notifyMatchFailure( - op, "pipe_handle must be produced by a pipe initialization op"); - } - auto declaredTile = rewriter.create(op.getLoc(), op.getTile().getType()); - rewriter.create( - op.getLoc(), declaredTile.getTile(), op.getPipeHandle(), - PipeAttr::get(rewriter.getContext(), *assignedPipe)); + rewriter.create(op.getLoc(), declaredTile.getTile(), + op.getPipeHandle()); rewriter.replaceOp(op, declaredTile.getTile()); return success(); } diff --git a/test/basic/tpop_internal_pipeline_from_pipe.mlir b/test/basic/tpop_internal_pipeline_from_pipe.mlir new file mode 100644 index 00000000..6c53442b --- /dev/null +++ b/test/basic/tpop_internal_pipeline_from_pipe.mlir @@ -0,0 +1,33 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @tpop_internal_pipeline_from_pipe( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %local_fifo_addr: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_l2g2l_pipe {dir_mask = 1, local_fifo_depth = 4} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, %local_fifo_addr : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + %fifo_tile = pto.declare_tile -> memref<32x128xf32, #pto.address_space> + pto.tpop_internal(%fifo_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tmov ins(%fifo_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile : memref<32x128xf32, #pto.address_space>) + pto.tfree_internal(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK: __global__ AICORE void tpop_internal_pipeline_from_pipe +// CHECK: TPOP( +// CHECK: TMOV( +// CHECK: TFREE( diff --git a/test/basic/tpush_tpop_fifo_emitc_a5_dirmask.mlir b/test/basic/tpush_tpop_fifo_emitc_a5_dirmask.mlir index 35ed946e..9452a658 100644 --- a/test/basic/tpush_tpop_fifo_emitc_a5_dirmask.mlir +++ b/test/basic/tpush_tpop_fifo_emitc_a5_dirmask.mlir @@ -1,4 +1,5 @@ // RUN: ptoas --pto-arch=a5 %s | FileCheck %s +// RUN: ptoas --pto-arch=a5 --enable-insert-sync %s 2>&1 | FileCheck %s --check-prefix=SYNC module { func.func @pipe_emitc_a5_dirmask( @@ -59,3 +60,8 @@ module { // CHECK-DAG: TPUSH( // CHECK-DAG: TPOP( // CHECK-DAG: TFREE( + +// SYNC-NOT: assigned_pipe +// SYNC: pto.tpop_internal( +// SYNC: pto.set_flag[, , ] +// SYNC: pto.wait_flag[, , ] From b139202b09228c4f3876e826f7808ece270870e8 Mon Sep 17 00:00:00 2001 From: qukelin Date: Sat, 14 Mar 2026 15:11:46 +0800 Subject: [PATCH 3/4] test: update tmp a5 tpush tpop matmul case --- test/tmp_tpush_tpop_matmul_pipe_a5/README.md | 7 +- .../tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir | 13 +-- .../kernel_a5_manual.cpp | 7 +- test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp | 109 ++++++++++-------- 4 files changed, 71 insertions(+), 65 deletions(-) diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/README.md b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md index 6d86fc6b..cc2e9638 100644 --- a/test/tmp_tpush_tpop_matmul_pipe_a5/README.md +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md @@ -7,7 +7,7 @@ - `kernel.mlir` - PTOAS 源 IR,表达目标链路: - Cube: `tload(gm->mat)` -> `tmov(mat->left/right)` -> `tmatmul` -> `tpush(acc, pipe)` - - Vector: `tpop(vec, pipe)` -> `tprint` + - Vector: `tpop(pipe)` -> `get_fifo_tile(pipe, slot)` -> `tmov(fifo->print tile)` -> `tprint` - `kernel_a5_manual.cpp` - 可运行的 A5 kernel C++(保留 `TPRINT`) - `main.cpp` @@ -36,12 +36,13 @@ awk 'BEGIN{emit=0} /^#include "pto\/pto-inst.hpp"/{emit=1} emit{print}' /tmp/tpu 将 `kernel_a5_manual.cpp` + `main.cpp` 按你现有 A5 st 工程的 CMake 方式接入并编译。 运行后,Vector 段执行 `TPRINT(vecOut)`,日志里应可看到打印输出。 -本样例里理论结果应是全 `1.0`(`A * B = I * Ones = Ones`)。 +本样例里理论打印结果应是全 `1.0`(`A * B = I * Ones = Ones`)。 +由于 A5 C2V 示例里 consumer tile 为 `8x16` Vec tile,因此打印的是对应的 `8x16` 结果块。 ## 3. 参数约定 - Tile 形状:`16x16` - 输入类型:`f32` - 输出累加类型:`f32` -- `pipe`:`dir_mask = 1`,C2V,A5 下映射 UB `VEC_FIFO` +- `pipe`:`initialize_l2l_pipe {dir_mask = 1}`,C2V,A5 下映射 UB `VEC_FIFO` - `main.cpp` 里 `c2vBuf = 0x10000` 为示例 UB 基址 diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir index e478bba6..915bb8d0 100644 --- a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir @@ -11,7 +11,6 @@ module { %left_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> %right_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> %acc_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> - %vec_mem = memref.alloc() : memref<8x16xf32, #pto.address_space> // TileBuf-only tprint path (currently may be folded out by some lowering passes). %vec_print = pto.alloc_tile : !pto.tile_buf @@ -33,11 +32,8 @@ module { %acc_tile = pto.bind_tile %acc_mem, %c16, %c16 { config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> - %vec_tile = pto.bind_tile %vec_mem, %c8, %c16 { - config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> - } : memref<8x16xf32, #pto.address_space> -> memref<8x16xf32, #pto.address_space> - %pipe = pto.initialize_pipe {dir_mask = 1, location = #pto.pipe_location} + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} (%c2v_consumer_buf : i32) -> !pto.pipe>, memref<8x16xf32, #pto.address_space>> @@ -54,9 +50,12 @@ module { } pto.section.vector { - pto.tpop(%vec_tile, %pipe : memref<8x16xf32, #pto.address_space>, !pto.pipe>, memref<8x16xf32, #pto.address_space>>) - pto.tmov ins(%vec_tile : memref<8x16xf32, #pto.address_space>) outs(%vec_print : !pto.tile_buf) + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<8x16xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<8x16xf32, #pto.address_space>>, index) + -> memref<8x16xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<8x16xf32, #pto.address_space>) outs(%vec_print : !pto.tile_buf) pto.tprint ins(%vec_print : !pto.tile_buf) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<8x16xf32, #pto.address_space>>, index) } return diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp index 3e9cf987..7e291847 100644 --- a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp @@ -25,15 +25,14 @@ __global__ AICORE void matmul_tpush_tpop_print(__gm__ float *gm_a, __gm__ float Tile accC; TASSIGN(accC, base0); - Tile vecOut; + Tile vecOut; TASSIGN(vecOut, base0); auto pipe = TPipe<0, FIFOType::VEC_FIFO, 8, 8, Tile, - Tile, - TSyncOpType::TMOV_C2UB, TSyncOpType::NONE, VecCubeRatio::V2C1_VECS>(c2v_consumer_buf); + Tile>(c2v_consumer_buf); #if defined(__DAV_CUBE__) using GTShape = pto::Shape<1, 1, 1, 16, 16>; diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp index 353bfabf..538b09eb 100644 --- a/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp @@ -4,67 +4,74 @@ #include #include -template -void LaunchMatmulTPushPopPrint(uint8_t *a, uint8_t *b, uint8_t *slot, - int32_t c2vBuf, int32_t v2cBuf, void *stream); +void LaunchMatmulTPushPopPrint(float *a, float *b, float *slot, int32_t c2vBuf, + void *stream); -#define ACL_CHECK(expr) \ - do { \ - aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ACL ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - return 1; \ - } \ - } while (0) +#define ACL_CHECK(expr) \ + do { \ + aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ACL ERROR] %s failed: %d (%s:%d)\n", #expr, \ + (int)_ret, __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) -int main() -{ - constexpr int M = 16; - constexpr int K = 16; - constexpr int N = 16; - constexpr size_t aBytes = M * K * sizeof(float); - constexpr size_t bBytes = K * N * sizeof(float); - constexpr size_t slotBytes = M * N * sizeof(float); +int main() { + constexpr int M = 16; + constexpr int K = 16; + constexpr int N = 16; + constexpr size_t aBytes = M * K * sizeof(float); + constexpr size_t bBytes = K * N * sizeof(float); + constexpr size_t slotBytes = M * N * sizeof(float); - std::vector hostA(M * K, 0.0f); - std::vector hostB(K * N, 1.0f); - std::vector hostSlot(M * N, 0.0f); - for (int i = 0; i < M; ++i) { - hostA[i * K + i] = 1.0f; // A = I - } + std::vector hostA(M * K, 0.0f); + std::vector hostB(K * N, 1.0f); + std::vector hostSlot(M * N, 0.0f); + for (int i = 0; i < M; ++i) { + hostA[i * K + i] = 1.0f; // A = I + } - ACL_CHECK(aclInit(nullptr)); - ACL_CHECK(aclrtSetDevice(0)); + ACL_CHECK(aclInit(nullptr)); + ACL_CHECK(aclrtSetDevice(0)); - aclrtStream stream = nullptr; - ACL_CHECK(aclrtCreateStream(&stream)); + aclrtStream stream = nullptr; + ACL_CHECK(aclrtCreateStream(&stream)); - uint8_t *devA = nullptr; - uint8_t *devB = nullptr; - uint8_t *devSlot = nullptr; - ACL_CHECK(aclrtMalloc(reinterpret_cast(&devA), aBytes, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&devB), bBytes, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&devSlot), slotBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + uint8_t *devA = nullptr; + uint8_t *devB = nullptr; + uint8_t *devSlot = nullptr; + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devA), aBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devB), bBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devSlot), slotBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemcpy(devA, aBytes, hostA.data(), aBytes, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(devB, bBytes, hostB.data(), bBytes, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(devSlot, slotBytes, hostSlot.data(), slotBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devA, aBytes, hostA.data(), aBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devB, bBytes, hostB.data(), bBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devSlot, slotBytes, hostSlot.data(), slotBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); - // For A5 C2V VEC_FIFO path, c2vBuf is local UB base address (example value). - constexpr int32_t c2vBuf = 0x10000; - constexpr int32_t v2cBuf = 0; + // For A5 C2V VEC_FIFO path, c2vBuf is local UB base address (example value). + constexpr int32_t c2vBuf = 0x10000; - LaunchMatmulTPushPopPrint<1>(devA, devB, devSlot, c2vBuf, v2cBuf, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); + LaunchMatmulTPushPopPrint( + reinterpret_cast(devA), reinterpret_cast(devB), + reinterpret_cast(devSlot), c2vBuf, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); - std::puts("Kernel finished. Expect TPRINT output to be all 1.0 (A=I, B=all-ones)."); + std::puts("Kernel finished. Expect TPRINT output for the 8x16 Vec tile to be " + "all 1.0 (A=I, B=all-ones)."); - aclrtFree(devA); - aclrtFree(devB); - aclrtFree(devSlot); - aclrtDestroyStream(stream); - aclrtResetDevice(0); - aclFinalize(); + aclrtFree(devA); + aclrtFree(devB); + aclrtFree(devSlot); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); - return 0; + return 0; } From 0a8e732c2f21975920a534c289829bb8c4188453 Mon Sep 17 00:00:00 2001 From: qukelin Date: Sat, 14 Mar 2026 15:44:31 +0800 Subject: [PATCH 4/4] test: add loop4 tpush tpop fifo case --- .../README.md | 58 +++++++++++++ .../kernel.mlir | 72 ++++++++++++++++ .../kernel_a5_manual.cpp | 78 ++++++++++++++++++ .../main.cpp | 82 +++++++++++++++++++ 4 files changed, 290 insertions(+) create mode 100644 test/tmp_tpush_tpop_matmul_pipe_loop4_a5/README.md create mode 100644 test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel.mlir create mode 100644 test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel_a5_manual.cpp create mode 100644 test/tmp_tpush_tpop_matmul_pipe_loop4_a5/main.cpp diff --git a/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/README.md b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/README.md new file mode 100644 index 00000000..75bb23c8 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/README.md @@ -0,0 +1,58 @@ +# A5 临时样例: 4 次 TMATMUL + TPUSH/TPOP(UB) 顺序检查 + +这个目录是临时验证用例,可直接删除。 + +## 目标 + +- 复用 `initialize_l2l_pipe` 的 C2V 路径 +- Cube 侧循环 4 次执行 `tpush` +- Vector 侧循环 4 次执行 `tpop` +- 检查 `tpop` 看到的 4 个输出块是否按 FIFO 顺序分别为全 `1.0`、全 `2.0`、全 `3.0`、全 `4.0` + +## 数据构造 + +- `A = 16x16` 单位阵 +- `B_all = 4 x (16x16)` 的连续 GM 数据 +- 第 1/2/3/4 个 `16x16` 子块分别填满 `1.0 / 2.0 / 3.0 / 4.0` +- 因为 `A * B = I * B = B`,所以 `tpush` 入 FIFO 的结果块和 `B` 一致 + +## 目录说明 + +- `kernel.mlir` + - PTOAS 源 IR + - Cube: `tload(gm->mat)` -> `tmov(mat->left/right)` -> `tmatmul` -> `tpush(acc, pipe)`,循环 4 次 + - Vector: `tpop(pipe)` -> `get_fifo_tile(pipe, slot)` -> `tprint` -> `tfree`,循环 4 次 +- `kernel_a5_manual.cpp` + - 可运行的 A5 kernel C++,保留 `TPRINT` +- `main.cpp` + - Host 侧 ACL 启动代码,构造 `A` 和 4 段 `B` + +## 1. 用 PTOAS 检查 MLIR + +在仓库根目录执行: + +```bash +./build/tools/ptoas/ptoas --pto-arch=a5 test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel.mlir > /tmp/tpush_tpop_matmul_pipe_loop4.out +``` + +说明:和现有临时样例一致,`kernel.mlir` 里的 `pto.tprint` 在部分 pass 组合下可能被提前折叠。 +所以这个目录同时提供 `kernel_a5_manual.cpp`,用于保留运行路径上的 `TPRINT`,直接检查 4 次 `tpop` 输出。 + +## 2. 运行预期 + +运行 `kernel_a5_manual.cpp` + `main.cpp` 后,应看到 4 段 `TPRINT` 输出。 + +- 第 1 段:全 `1.0` +- 第 2 段:全 `2.0` +- 第 3 段:全 `3.0` +- 第 4 段:全 `4.0` + +由于 A5 C2V 示例里的 consumer tile 是 `8x16` Vec tile,所以每次打印的是对应结果块的 `8x16` 输出。 + +## 3. 参数约定 + +- Tile 形状:`16x16` +- 输入类型:`f32` +- 输出累加类型:`f32` +- `pipe`:`initialize_l2l_pipe {dir_mask = 1}`,C2V,A5 下映射 UB `VEC_FIFO` +- `main.cpp` 里 `c2vBuf = 0x10000` 为示例 UB 基址 diff --git a/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel.mlir b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel.mlir new file mode 100644 index 00000000..400cdf2a --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel.mlir @@ -0,0 +1,72 @@ +module { + func.func @matmul_tpush_tpop_loop4_print( + %gm_a: memref<16x16xf32, #pto.address_space>, + %gm_b_all: memref<64x16xf32, #pto.address_space>, + %c2v_consumer_buf: i32) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + + %mat_a_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %mat_b_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %left_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %right_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %acc_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + + %vec_print = pto.alloc_tile : !pto.tile_buf + + %mat_a_tile = pto.bind_tile %mat_a_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %mat_b_tile = pto.bind_tile %mat_b_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %left_tile = pto.bind_tile %left_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %right_tile = pto.bind_tile %right_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %acc_tile = pto.bind_tile %acc_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %pipe = pto.initialize_l2l_pipe {dir_mask = 1} + (%c2v_consumer_buf : i32) + -> !pto.pipe>, memref<8x16xf32, #pto.address_space>> + + pto.section.cube { + pto.tload ins(%gm_a : memref<16x16xf32, #pto.address_space>) outs(%mat_a_tile : memref<16x16xf32, #pto.address_space>) + pto.tmov ins(%mat_a_tile : memref<16x16xf32, #pto.address_space>) outs(%left_tile : memref<16x16xf32, #pto.address_space>) + + scf.for %i = %c0 to %c4 step %c1 { + %row_offset = arith.muli %i, %c16 : index + %gm_b_iter = memref.subview %gm_b_all[%row_offset, %c0] [16, 16] [1, 1] + : memref<64x16xf32, #pto.address_space> + to memref<16x16xf32, strided<[16, 1], offset: ?>, #pto.address_space> + + pto.tload ins(%gm_b_iter : memref<16x16xf32, strided<[16, 1], offset: ?>, #pto.address_space>) outs(%mat_b_tile : memref<16x16xf32, #pto.address_space>) + pto.tmov ins(%mat_b_tile : memref<16x16xf32, #pto.address_space>) outs(%right_tile : memref<16x16xf32, #pto.address_space>) + pto.tmatmul ins(%left_tile, %right_tile : memref<16x16xf32, #pto.address_space>, memref<16x16xf32, #pto.address_space>) outs(%acc_tile : memref<16x16xf32, #pto.address_space>) + pto.tpush(%acc_tile, %pipe : memref<16x16xf32, #pto.address_space>, !pto.pipe>, memref<8x16xf32, #pto.address_space>>) + } + } + + pto.section.vector { + scf.for %i = %c0 to %c4 step %c1 { + %slot_id = pto.tpop(%pipe : !pto.pipe>, memref<8x16xf32, #pto.address_space>>) -> index + %fifo_tile = pto.get_fifo_tile(%pipe, %slot_id : !pto.pipe>, memref<8x16xf32, #pto.address_space>>, index) + -> memref<8x16xf32, #pto.address_space> + pto.tmov ins(%fifo_tile : memref<8x16xf32, #pto.address_space>) outs(%vec_print : !pto.tile_buf) + pto.tprint ins(%vec_print : !pto.tile_buf) + pto.tfree(%pipe, %slot_id : !pto.pipe>, memref<8x16xf32, #pto.address_space>>, index) + } + } + + return + } +} diff --git a/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel_a5_manual.cpp b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel_a5_manual.cpp new file mode 100644 index 00000000..671b6785 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/kernel_a5_manual.cpp @@ -0,0 +1,78 @@ +#include "pto/pto-inst.hpp" + +using namespace pto; + +__global__ AICORE void matmul_tpush_tpop_loop4_print(__gm__ float *gm_a, + __gm__ float *gm_b_all, + __gm__ float *gm_slot_buffer, + int32_t c2v_consumer_buf, + int32_t v2c_consumer_buf) +{ + (void)gm_slot_buffer; + (void)v2c_consumer_buf; + + int64_t base0 = 0; + int64_t base512 = 512; + + Tile matA; + TASSIGN(matA, base0); + Tile matB; + TASSIGN(matB, base512); + + Tile leftA; + TASSIGN(leftA, base0); + Tile rightB; + TASSIGN(rightB, base0); + + Tile accC; + TASSIGN(accC, base0); + Tile vecOut; + TASSIGN(vecOut, base0); + + auto pipe = TPipe<0, FIFOType::VEC_FIFO, 8, 8, + Tile, + Tile>(c2v_consumer_buf); + +#if defined(__DAV_CUBE__) + using GTShape = pto::Shape<1, 1, 1, 16, 16>; + using GTStride = pto::Stride<256, 256, 256, 16, 1>; + using GlobalFloat = GlobalTensor; + + GTShape shape = GTShape(); + GTStride stride = GTStride(); + + GlobalFloat gA(gm_a, shape, stride); + TLOAD(matA, gA); + TMOV(leftA, matA); + + for (int iter = 0; iter < 4; ++iter) { + GlobalFloat gB(gm_b_all + iter * 256, shape, stride); + TLOAD(matB, gB); + TMOV(rightB, matB); + TMATMUL(accC, leftA, rightB); + TPUSH(accC, pipe); + } +#endif + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + for (int iter = 0; iter < 4; ++iter) { + TPOP(vecOut, pipe); + TPRINT(vecOut); + TFREE(pipe); + } +#endif +} + +void LaunchMatmulTPushPopLoop4Print(uint8_t *a, uint8_t *b_all, uint8_t *slot, + int32_t c2vBuf, void *stream) +{ + constexpr int32_t v2cBuf = 0; + matmul_tpush_tpop_loop4_print<<<1, nullptr, stream>>>(reinterpret_cast(a), + reinterpret_cast(b_all), + reinterpret_cast(slot), + c2vBuf, v2cBuf); +} diff --git a/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/main.cpp b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/main.cpp new file mode 100644 index 00000000..777a37df --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_loop4_a5/main.cpp @@ -0,0 +1,82 @@ +#include "acl/acl.h" +#include +#include +#include +#include + +void LaunchMatmulTPushPopLoop4Print(uint8_t *a, uint8_t *b_all, uint8_t *slot, + int32_t c2vBuf, void *stream); + +#define ACL_CHECK(expr) \ + do { \ + aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ACL ERROR] %s failed: %d (%s:%d)\n", #expr, \ + (int)_ret, __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) + +int main() { + constexpr int M = 16; + constexpr int K = 16; + constexpr int N = 16; + constexpr int Iter = 4; + constexpr size_t aBytes = M * K * sizeof(float); + constexpr size_t bBytes = Iter * K * N * sizeof(float); + constexpr size_t slotBytes = M * N * sizeof(float); + + std::vector hostA(M * K, 0.0f); + std::vector hostBAll(Iter * K * N, 0.0f); + std::vector hostSlot(M * N, 0.0f); + for (int i = 0; i < M; ++i) { + hostA[i * K + i] = 1.0f; + } + for (int iter = 0; iter < Iter; ++iter) { + const float value = static_cast(iter + 1); + const size_t base = static_cast(iter) * K * N; + for (int idx = 0; idx < K * N; ++idx) { + hostBAll[base + idx] = value; + } + } + + ACL_CHECK(aclInit(nullptr)); + ACL_CHECK(aclrtSetDevice(0)); + + aclrtStream stream = nullptr; + ACL_CHECK(aclrtCreateStream(&stream)); + + uint8_t *devA = nullptr; + uint8_t *devBAll = nullptr; + uint8_t *devSlot = nullptr; + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devA), aBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devBAll), bBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devSlot), slotBytes, + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(devA, aBytes, hostA.data(), aBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devBAll, bBytes, hostBAll.data(), bBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devSlot, slotBytes, hostSlot.data(), slotBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + + constexpr int32_t c2vBuf = 0x10000; + + LaunchMatmulTPushPopLoop4Print(devA, devBAll, devSlot, c2vBuf, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + std::puts("Kernel finished. Expect 4 TPRINT blocks with 8x16 outputs filled " + "with 1.0, 2.0, 3.0, 4.0 in order."); + + aclrtFree(devA); + aclrtFree(devBAll); + aclrtFree(devSlot); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + return 0; +}