diff --git a/docs/2026-03-07-tpush-tpop-op-interface-todo.md b/docs/2026-03-07-tpush-tpop-op-interface-todo.md new file mode 100644 index 00000000..04ccb541 --- /dev/null +++ b/docs/2026-03-07-tpush-tpop-op-interface-todo.md @@ -0,0 +1,108 @@ +# PTOAS 新增 OP 接口与 TODO + +## 1. 范围与目标 + +说明在PTOAS中新增的 OP 接口定义与参数语义,包括: + +- `pto.initialize_pipe` +- `pto.tpush(tile, pipe)` +- `pto.tpop(tile, pipe)` +- `pto.tfree(pipe)` + +并补充当前待完成TODO项。 + +## 2. OP 接口定义 + +### 2.1 `pto.initialize_pipe` + +用途:在函数级完成 ring buffer/pipe 句柄初始化,返回统一 `pipe` 句柄,供后续 `tpush/tpop` 显式传递。 + +概念签名: + +```mlir +%pipe = pto.initialize_pipe {dir_mask = , slot_size = } + (%gm_slot_buffer : , + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe +``` + +参数说明: + +| 参数 | 类型 | 说明 | 约束 | +|---|---|---|---| +| `dir_mask` | `i8 attr` | 方向掩码,`1`=C2V,`2`=V2C,`3`=双向 | 当前 PTOAS 中 `3` 暂不支持,直接报错 | +| `slot_size` | `i32 attr` | 单 slot 大小(字节) | 必须 `> 0` | +| `gm_slot_buffer` | `PTODpsType` | GM slot buffer 基址/句柄 | 必填 | +| `c2v_consumer_buf` | `i32` | A5 下 C2V consumer 侧本地 buffer 基址 | 必填 | +| `v2c_consumer_buf` | `i32` | A5 下 V2C consumer 侧本地 buffer 基址 | 必填 | +| 返回值 `pipe` | `!pto.pipe` | 统一 pipe 句柄 | `location/depth/numBuffers` 由 `initialize_pipe` 参数推导 | + +### 2.2 `pto.tpush(tile, pipe)` + +用途:生产者将 tile 按 `pipe` 描述写入 ring buffer。 + +概念签名: + +```mlir +pto.tpush(%tile, %pipe : , !pto.pipe) +``` + +参数说明: + +| 参数 | 类型 | 说明 | 约束 | +|---|---|---|---| +| `tile` | `PTODpsType` | 生产者要推送的 tile 变量 | `tile` 类型必须匹配 `pipe.src_tile_type` | +| `pipe` | `!pto.pipe` | `initialize_pipe` 返回的 pipe 句柄 | 必须显式传入,不能隐式推导 | + +### 2.3 `pto.tpop(tile, pipe)` + +用途:消费者从 `pipe` 读取 tile 数据到目标 `tile` 变量。 + +概念签名: + +```mlir +pto.tpop(%tile, %pipe : , !pto.pipe) +``` + +参数说明: + +| 参数 | 类型 | 说明 | 约束 | +|---|---|---|---| +| `tile` | `PTODpsType` | 消费者接收数据的 tile 变量 | `tile` 类型必须匹配 `pipe.dst_tile_type` | +| `pipe` | `!pto.pipe` | `initialize_pipe` 返回的 pipe 句柄 | 必须显式传入,不能隐式推导 | + +语义备注: + +- `tpop` 只负责”获取 slot + 读取数据”。 +- slot 释放由独立的 `pto.tfree` 完成(仅 A5 架构需要,见 2.4)。 + +### 2.4 `pto.tfree(pipe)` + +用途:显式释放 `tpop` 占用的 pipe slot。仅 A5 架构需要——A5 使用 Local buffer 作为 push/pop 数据传递介质,`tpop` 后数据仍在 slot 中供后续计算读取,必须等消费者用完后才能释放。A2A3 使用 Global Memory 通信,`tpop` 已将数据拷贝至本地内存,slot 可立即释放,因此 `tfree` 在 A2A3 上为空操作(EmitC 直接擦除)。 + +概念签名: + +```mlir +pto.tfree(%pipe : !pto.pipe) +``` + +参数说明: + +| 参数 | 类型 | 说明 | 约束 | +|---|---|---|---| +| `pipe` | `!pto.pipe` | `initialize_pipe` 返回的 pipe 句柄 | 必须与对应 `tpop` 使用同一 pipe | + +约束与行为: + +- 必须在 `section.cube` 或 `section.vector` 内部使用。 +- 每个 `tpop` 应对应一个 `tfree`,使用相同的 `pipe_handle`。 +- `InsertTFreePass`(仅 A5)会在 `tpop` 的 tile 数据最后一次被读取之后自动插入 `tfree`;已有手写 `tfree` 的 `tpop` 会被跳过。 +- EmitC 降低:A5 生成 `TFREE(...)`,A2A3 擦除该 op。 + +## 3. TODO(当前版本) + +### T1. FlagID 分配策略重构 + +- 当前(`0,2,4,6,8,10,12`)的线性分配策略较简单。 +- 应该在 kernel 函数范围内进行分析和分配。 diff --git a/include/PTO/IR/PTOAttrs.td b/include/PTO/IR/PTOAttrs.td index 343a3b76..4fea96b6 100644 --- a/include/PTO/IR/PTOAttrs.td +++ b/include/PTO/IR/PTOAttrs.td @@ -437,4 +437,27 @@ def TileBufConfigAttr : AttrDef { }]; } +//===----------------------------------------------------------------------===// +// Pipe Location (for TPUSH/TPOP unified pipe handle) +//===----------------------------------------------------------------------===// + +def PTO_PipeLocation_GM : I32EnumAttrCase<"GM", 0, "gm">; +def PTO_PipeLocation_LOCAL : I32EnumAttrCase<"LOCAL", 1, "local">; + +def PTO_PipeLocationEnum : PTO_I32Enum< + "PipeLocation", "PTO pipe physical location", [ + PTO_PipeLocation_GM, + PTO_PipeLocation_LOCAL + ]>; + +def PTO_PipeLocationAttr : PTO_Attr<"PipeLocation", "pipe_location"> { + let parameters = (ins EnumParameter:$location); + let assemblyFormat = "`<` params `>`"; + let description = [{ + Physical location of cross-core pipe buffer. + GM: pipe in global memory. + LOCAL: pipe in on-chip local memory (UB/L1). + }]; +} + #endif // MLIR_DIALECT_PTO_IR_PTOATTRS diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 2b57228f..0fe4e983 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -3645,4 +3645,118 @@ def TPrintOp: PTO_TOp<"tprint", [ }]; } +//===----------------------------------------------------------------------===// +// TPUSH/TPOP Ring Buffer Communication Ops +//===----------------------------------------------------------------------===// + +// --- Initialization --- + +def InitializePipeOp : PTO_Op<"initialize_pipe", [ + DeclareOpInterfaceMethods +]> { + let summary = "Initialize ring buffer pipe handle"; + let description = [{ + Called once at kernel startup. Binds ring buffer pipe to backing memory, + computes slot configuration from dir_mask, and returns a pipe handle. + }]; + + let arguments = (ins + I8Attr:$dir_mask, + I32Attr:$slot_size, + PTODpsType:$gm_slot_buffer, + I32:$c2v_consumer_buf, + I32:$v2c_consumer_buf + ); + + let results = (outs PipeType:$pipe); + let hasVerifier = 1; + + let assemblyFormat = [{ + `{` `dir_mask` `=` $dir_mask `,` `slot_size` `=` $slot_size `}` + `(` $gm_slot_buffer `:` qualified(type($gm_slot_buffer)) `,` + $c2v_consumer_buf `:` type($c2v_consumer_buf) `,` + $v2c_consumer_buf `:` type($v2c_consumer_buf) + `)` attr-dict `->` qualified(type($pipe)) + }]; +} + +// --- Data Transfer: Push (producer, no DPS) --- + +def TPushOp : PTO_TOp<"tpush", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Push tile data via unified pipe handle"; + + let arguments = (ins + PTODpsType:$tile, + PipeType:$pipe_handle + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $tile `,` $pipe_handle `:` qualified(type($tile)) `,` qualified(type($pipe_handle)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + }]; +} + +// --- Data Transfer: Pop (consumer, DPS) --- + +def TPopOp : PTO_TOp<"tpop", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Pop tile data via unified pipe handle"; + + let arguments = (ins + PTODpsType:$tile, + PipeType:$pipe_handle + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $tile `,` $pipe_handle `:` qualified(type($tile)) `,` qualified(type($pipe_handle)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getTileMutable(); } + }]; +} + +// --- Data Transfer: Free (consumer slot release) --- + +def TFreeOp : PTO_TOp<"tfree", [ + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Release pipe slot after consumer finishes using data"; + + let arguments = (ins + PipeType:$pipe_handle + ); + + let results = (outs); + let hasVerifier = 1; + + let assemblyFormat = [{ + `(` $pipe_handle `:` qualified(type($pipe_handle)) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_MTE1; } + }]; +} + #endif // MLIR_DIALECT_PTO_IR_PTOOPS diff --git a/include/PTO/IR/PTOTypeDefs.td b/include/PTO/IR/PTOTypeDefs.td index d4a79746..4a3bc569 100644 --- a/include/PTO/IR/PTOTypeDefs.td +++ b/include/PTO/IR/PTOTypeDefs.td @@ -184,3 +184,15 @@ def TileBufType : TypeDef { int32_t getPadValueI32() const; // 0 null, 1 zero, 2 max, 3 min }]; } + +def PipeType : TypeDef { + let mnemonic = "pipe"; + let summary = "Pipe handle type for TPUSH/TPOP unified static schedule"; + let parameters = (ins + "mlir::Type":$srcTileType, + "mlir::Type":$dstTileType + ); + let assemblyFormat = [{ + `<` $srcTileType `,` $dstTileType `>` + }]; +} diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index 3df9390d..cf60d400 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -65,6 +65,8 @@ std::unique_ptr createPTOViewToMemrefPass(); std::unique_ptr createPTOInsertLoadStoreForMixCVPass(); std::unique_ptr createInferPTOLayoutPass(); // Declare register function +std::unique_ptr createPTOInsertTFreePass(); + void registerPTOPasses(); } // namespace pto diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index ab7f29df..18d92d4e 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -116,4 +116,21 @@ def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> { ]; } +def PTOInsertTFree : Pass<"pto-insert-tfree", "func::FuncOp"> { + let summary = "Auto-insert pto.tfree after last use of tpop tile data (A5 only)"; + let description = [{ + For each pto.tpop in section.cube / section.vector, analyzes the + data dependency of the popped tile and inserts pto.tfree(pipe_handle) + at the earliest safe point — immediately after the last read of the tile. + Skips tpop ops that already have a matching tfree. + Only meaningful on A5 architecture where tpop/tfree split protocol is used. + }]; + + let constructor = "mlir::pto::createPTOInsertTFreePass()"; + + let dependentDialects = [ + "mlir::pto::PTODialect" + ]; +} + #endif // MLIR_DIALECT_PTO_PASSES diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 73ff0cb0..d9ae2967 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -2006,6 +2006,73 @@ LogicalResult RlsBufOp::verify() { return verifyBufSyncOp(getOperation(), getPipe(), getBufIdAttr(), getModeAttr()); } + +//===----------------------------------------------------------------------===// +// TPUSH/TPOP Op Verifiers +//===----------------------------------------------------------------------===// + +static bool isInsideSection(Operation *op) { + return op->getParentOfType() || + op->getParentOfType(); +} + +static bool isInsideSectionCube(Operation *op) { + return op->getParentOfType() != nullptr; +} + +static bool isInsideSectionVector(Operation *op) { + return op->getParentOfType() != nullptr; +} + +static LogicalResult verifyInitializePipe(Operation *op, int8_t dirMask, + int32_t slotSize, Type pipeType) { + if (isInsideSection(op)) + return op->emitOpError("must be at function level, not inside a section"); + if (dirMask == 3) + return op->emitOpError("bidirectional DIRMASK is not supported"); + if (dirMask != 1 && dirMask != 2) + return op->emitOpError("dir_mask must be 1 (C2V) or 2 (V2C)"); + if (slotSize <= 0) + return op->emitOpError("slot_size must be positive"); + auto pipeTy = dyn_cast(pipeType); + if (!pipeTy) + return op->emitOpError("result type must be !pto.pipe<...>"); + return success(); +} + +LogicalResult InitializePipeOp::verify() { + return verifyInitializePipe(getOperation(), getDirMask(), getSlotSize(), + getPipe().getType()); +} + +static LogicalResult verifyPipeTileType(Operation *op, Type pipeType, + Type tileType, bool isPush) { + auto pipeTy = dyn_cast(pipeType); + if (!pipeTy) + return op->emitOpError("expects pipe operand type !pto.pipe<...>"); + + Type expected = isPush ? pipeTy.getSrcTileType() : pipeTy.getDstTileType(); + if (tileType != expected) { + return op->emitOpError(isPush ? "tile type must match pipe src tile type" + : "tile type must match pipe dst tile type"); + } + return success(); +} + +LogicalResult TPushOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/true); +} + +LogicalResult TPopOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return verifyPipeTileType(getOperation(), getPipeHandle().getType(), + getTile().getType(), /*isPush=*/false); +} + // ---- TOp ---- LogicalResult TGemvBiasOp::verify() { if (getPTOTypeRank(getA().getType()) == -1 || @@ -4447,6 +4514,44 @@ void TMatmulMxBiasOp::getEffects(SmallVectorImpl> + &effects) { + addEffect(effects, &getGmSlotBufferMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getC2vConsumerBufMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getV2cConsumerBufMutable(), MemoryEffects::Read::get()); + addEffect(effects, getOperation()->getOpResult(0), MemoryEffects::Write::get()); +} + +void TPushOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getTileMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + +void TPopOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); + addEffect(effects, &getTileMutable(), MemoryEffects::Write::get()); +} + +LogicalResult TFreeOp::verify() { + if (!isInsideSectionCube(getOperation()) && !isInsideSectionVector(getOperation())) + return emitOpError("must be inside a section.cube or section.vector"); + return success(); +} + +void TFreeOp::getEffects( + SmallVectorImpl> + &effects) { + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Read::get()); + addEffect(effects, &getPipeHandleMutable(), MemoryEffects::Write::get()); +} + // [Include 必须放在最后] #include "PTO/IR/PTOInterfaces.cpp.inc" #define GET_OP_CLASSES diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index d9d013c9..a1d92f78 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -26,6 +26,7 @@ add_mlir_dialect_library(PTOTransforms InsertSync/SyncEventIdAllocation.cpp InsertSync/SyncCodegen.cpp LoweringSyncToPipe.cpp + PTOInsertTFreePass.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/PTO diff --git a/lib/PTO/Transforms/PTOInsertTFreePass.cpp b/lib/PTO/Transforms/PTOInsertTFreePass.cpp new file mode 100644 index 00000000..98b7c352 --- /dev/null +++ b/lib/PTO/Transforms/PTOInsertTFreePass.cpp @@ -0,0 +1,102 @@ +#include "PTO/Transforms/Passes.h" +#include "PTO/IR/PTO.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace pto { +namespace func = ::mlir::func; +#define GEN_PASS_DEF_PTOINSERTTFREE +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +/// Check whether `tpopOp` already has a matching tfree in the same block. +static bool alreadyHasTFree(TPopOp tpopOp) { + Value pipeHandle = tpopOp.getPipeHandle(); + Block *block = tpopOp->getBlock(); + for (auto it = std::next(tpopOp->getIterator()), end = block->end(); + it != end; ++it) { + if (auto tfreeOp = dyn_cast(&*it)) { + if (tfreeOp.getPipeHandle() == pipeHandle) + return true; + } + } + return false; +} + +/// Find the last operation in the same block (after startOp) that reads `tile` +/// via MemoryEffectsOpInterface. +static Operation *findLastReadOf(Value tile, Operation *startOp) { + Block *block = startOp->getBlock(); + Operation *lastRead = nullptr; + + for (auto it = std::next(startOp->getIterator()), end = block->end(); + it != end; ++it) { + Operation *op = &*it; + + // Check via MemoryEffectsOpInterface. + if (auto memEffect = dyn_cast(op)) { + SmallVector, 4> + effects; + memEffect.getEffects(effects); + for (auto &effect : effects) { + if (effect.getValue() == tile && + isa(effect.getEffect())) { + lastRead = op; + break; + } + } + } + } + + return lastRead; +} + +struct PTOInsertTFreePass + : public mlir::pto::impl::PTOInsertTFreeBase { + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + + // Collect tpop ops first to avoid iterator invalidation. + SmallVector tpops; + funcOp.walk([&](TPopOp op) { tpops.push_back(op); }); + + for (TPopOp tpopOp : tpops) { + // Skip if already has a matching tfree. + if (alreadyHasTFree(tpopOp)) + continue; + + // Must be inside a section. + if (!tpopOp->getParentOfType() && + !tpopOp->getParentOfType()) + continue; + + Value tile = tpopOp.getTile(); + Value pipeHandle = tpopOp.getPipeHandle(); + + // Find the last read of the tile after the tpop. + Operation *lastRead = findLastReadOf(tile, tpopOp.getOperation()); + + // Insert tfree after the last read, or right after tpop if no reads. + Operation *insertAfter = lastRead ? lastRead : tpopOp.getOperation(); + OpBuilder builder(insertAfter->getContext()); + builder.setInsertionPointAfter(insertAfter); + builder.create(tpopOp.getLoc(), pipeHandle); + } + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTOInsertTFreePass() { + return std::make_unique(); +} diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp index 24d24171..1bfbe8f0 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.cpp +++ b/lib/PTO/Transforms/PTOPlanMemory.cpp @@ -160,6 +160,9 @@ void MemLivenessAnalysis::RecursionIR(Region *region, Liveness live) { UpdateOpGenInfo(curOpInfo, llvm::to_vector(gpuLaunchOp->getOperands())); // UpdateOpTempGenInfo(curOpInfo); OpKillHandle(curOpInfo, live, op->getBlock()); + } else if (isa(op)) { + UpdateOpGenInfo(curOpInfo, llvm::to_vector(op->getOperands())); + OpKillHandle(curOpInfo, live, op->getBlock()); } else if (failed(CheckIfUnknownOpTouchBuffer(op))) { return WalkResult::interrupt(); } diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index d1d43f08..ed481ec9 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -79,6 +79,158 @@ static const char *addrSpaceQualifier(pto::AddressSpace as) { return "__gm__"; } +static std::string getEmitCElementTypeToken(Type elemTy) { + if (elemTy.isF16()) + return "half"; + if (elemTy.isBF16()) + return "bfloat16_t"; + if (elemTy.isF32()) + return "float"; + if (elemTy.isF64()) + return "double"; + if (elemTy.isInteger(8)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int8_t" : "uint8_t"; + } + if (elemTy.isInteger(16)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int16_t" : "uint16_t"; + } + if (elemTy.isInteger(32)) { + auto intTy = cast(elemTy); + return (intTy.isSignless() || intTy.isSigned()) ? "int32_t" : "uint32_t"; + } + if (elemTy.isInteger(64)) { + auto intTy = cast(elemTy); + return intTy.isUnsigned() ? "uint64_t" : "int64_t"; + } + return ""; +} + +static const char *getEmitCTileRoleToken(pto::AddressSpace as) { + switch (as) { + case pto::AddressSpace::VEC: + return "TileType::Vec"; + case pto::AddressSpace::MAT: + return "TileType::Mat"; + case pto::AddressSpace::LEFT: + return "TileType::Left"; + case pto::AddressSpace::RIGHT: + return "TileType::Right"; + case pto::AddressSpace::ACC: + return "TileType::Acc"; + case pto::AddressSpace::BIAS: + return "TileType::Bias"; + case pto::AddressSpace::SCALING: + return "TileType::Scaling"; + case pto::AddressSpace::GM: + case pto::AddressSpace::Zero: + return "TileType::Vec"; + } + return "TileType::Vec"; +} + +struct EmitCTileLayoutDefaults { + const char *blTok; + const char *slTok; + int fractalBytes; + const char *padTok; +}; + +static EmitCTileLayoutDefaults +getEmitCTileLayoutDefaults(pto::AddressSpace as, PTOArch targetArch) { + switch (as) { + case pto::AddressSpace::ACC: + return {"BLayout::ColMajor", "SLayout::RowMajor", 1024, "PadValue::Null"}; + case pto::AddressSpace::RIGHT: + return {"BLayout::RowMajor", "SLayout::ColMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::LEFT: + if (targetArch == PTOArch::A3) { + // pto-isa A2/A3: TileLeft -> BLayout::RowMajor + SLayout::RowMajor. + return {"BLayout::RowMajor", "SLayout::RowMajor", 512, + "PadValue::Null"}; + } + // pto-isa A5: TileLeft -> BLayout::ColMajor + SLayout::RowMajor. + return {"BLayout::ColMajor", "SLayout::RowMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::MAT: + return {"BLayout::ColMajor", "SLayout::RowMajor", 512, "PadValue::Null"}; + case pto::AddressSpace::VEC: + case pto::AddressSpace::BIAS: + case pto::AddressSpace::SCALING: + case pto::AddressSpace::GM: + case pto::AddressSpace::Zero: + return {"BLayout::RowMajor", "SLayout::NoneBox", 512, "PadValue::Null"}; + } + return {"BLayout::RowMajor", "SLayout::NoneBox", 512, "PadValue::Null"}; +} + +static FailureOr getEmitCTileTypeTokenFromType(Type ty, + PTOArch targetArch) { + auto mrTy = dyn_cast(ty); + if (!mrTy || mrTy.getRank() < 2 || !mrTy.hasStaticShape()) + return failure(); + + int64_t rows = mrTy.getDimSize(0); + int64_t cols = mrTy.getDimSize(1); + if (rows == ShapedType::kDynamic || cols == ShapedType::kDynamic) + return failure(); + + std::string elemTok = getEmitCElementTypeToken(mrTy.getElementType()); + if (elemTok.empty()) + return failure(); + + pto::AddressSpace as = pto::AddressSpace::VEC; + if (auto asAttr = + dyn_cast_or_null(mrTy.getMemorySpace())) + as = asAttr.getAddressSpace(); + + const char *roleTok = getEmitCTileRoleToken(as); + auto defaults = getEmitCTileLayoutDefaults(as, targetArch); + return std::string("Tile<") + roleTok + ", " + elemTok + ", " + + std::to_string(rows) + ", " + std::to_string(cols) + + ", " + defaults.blTok + ", " + std::to_string(rows) + ", " + + std::to_string(cols) + ", " + defaults.slTok + ", " + + std::to_string(defaults.fractalBytes) + ", " + defaults.padTok + ">"; +} + +static FailureOr allocateFlagBaseForInitOp(mlir::pto::InitializePipeOp op) { + static constexpr int kFlagBasePool[] = {0, 2, 4, 6, 8, 10, 12, 14}; + static constexpr int kFlagBasePoolSize = 8; + int idx = 0; + for (Operation &candidate : *op->getBlock()) { + if (auto init = dyn_cast(&candidate)) { + if (init == op) + break; + ++idx; + } + } + if (idx < 0 || idx >= kFlagBasePoolSize) + return failure(); + return kFlagBasePool[idx]; +} + +static FailureOr +getTPipePipeTypeToken(int8_t dirMask, PTOArch targetArch) { + if (targetArch == PTOArch::A3) + return std::string("FIFOType::GM_FIFO"); + if (targetArch == PTOArch::A5) { + if (dirMask == 1) + return std::string("FIFOType::VEC_FIFO"); + if (dirMask == 2) + return std::string("FIFOType::MAT_FIFO"); + } + return failure(); +} + +static std::pair +getTPipeSyncTypeTokens(const std::string &pipeTypeToken) { + if (pipeTypeToken == "FIFOType::GM_FIFO") + return {"TSyncOpType::TSTORE_C2GM_UFOFF", "TSyncOpType::TLOAD"}; + if (pipeTypeToken == "FIFOType::MAT_FIFO") + return {"TSyncOpType::TINSERT_V2L1", "TSyncOpType::NONE"}; + return {"TSyncOpType::TMOV_C2UB", "TSyncOpType::NONE"}; +} + static Value peelUnrealized(Value v) { if (auto castOp = v.getDefiningOp()) return castOp.getOperand(0); @@ -231,6 +383,13 @@ class PTOToEmitCTypeConverter : public TypeConverter { emitc::OpaqueType::get(Ctx, finalTypeStr)); }); + addConversion([Ctx](pto::PipeType type) -> Type { + // `initialize_pipe` returns a Pipe-dependent handle type in generated C++. + // Keep it as `auto` to avoid hard-coding a non-existent concrete pipe type. + (void)type; + return emitc::OpaqueType::get(Ctx, "auto"); + }); + // --------------------------------------------------------- // 3. MemRef 转换 (Debug 重点) // --------------------------------------------------------- @@ -2851,6 +3010,11 @@ struct PointerCastConversion : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; + PointerCastConversion(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + enum class TileRole { Vec, Mat, Left, Right, Acc, Bias, Scaling }; static void collectUserOpsThroughCasts(Value v, SmallVectorImpl &out) { @@ -2909,6 +3073,26 @@ struct PointerCastConversion : public OpConversionPattern { return TileRole::Vec; } + EmitCTileLayoutDefaults getDefaultLayoutForRole(TileRole role) const { + switch (role) { + case TileRole::Left: + return getEmitCTileLayoutDefaults(pto::AddressSpace::LEFT, targetArch); + case TileRole::Right: + return getEmitCTileLayoutDefaults(pto::AddressSpace::RIGHT, targetArch); + case TileRole::Acc: + return getEmitCTileLayoutDefaults(pto::AddressSpace::ACC, targetArch); + case TileRole::Bias: + return getEmitCTileLayoutDefaults(pto::AddressSpace::BIAS, targetArch); + case TileRole::Mat: + return getEmitCTileLayoutDefaults(pto::AddressSpace::MAT, targetArch); + case TileRole::Scaling: + return getEmitCTileLayoutDefaults(pto::AddressSpace::SCALING, targetArch); + case TileRole::Vec: + return getEmitCTileLayoutDefaults(pto::AddressSpace::VEC, targetArch); + } + return getEmitCTileLayoutDefaults(pto::AddressSpace::VEC, targetArch); + } + // [新增] 辅助函数:判断 Value 是否源自 arith.constant static bool isConstant(Value v, int64_t &outVal) { if (!v) return false; @@ -2964,9 +3148,12 @@ struct PointerCastConversion : public OpConversionPattern { case TileRole::Scaling: roleTok = "TileType::Scaling"; break; } - // 4. Config & Layout (support BLayoutAttr/SLayoutAttr/PadValueAttr after namespace change) - std::string layoutParams = "BLayout::RowMajor"; - std::string extraParams = ""; + // 4. Config & Layout (default by tile role; explicit config overrides) + auto defaults = getDefaultLayoutForRole(role); + std::string layoutParams = defaults.blTok; + std::string slStr = defaults.slTok; + int32_t frVal = defaults.fractalBytes; + std::string padStr = defaults.padTok; if (auto configOpt = op.getConfig()) { auto config = *configOpt; int32_t blVal = 0; @@ -2979,25 +3166,21 @@ struct PointerCastConversion : public OpConversionPattern { if (auto attr = dyn_cast(config.getSLayout())) slVal = static_cast(attr.getValue()); - std::string slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? "SLayout::ColMajor" : "SLayout::NoneBox"; + slStr = (slVal == 1) ? "SLayout::RowMajor" : (slVal == 2) ? "SLayout::ColMajor" : "SLayout::NoneBox"; - int32_t frVal = 0; + frVal = 0; if (auto attr = dyn_cast(config.getSFractalSize())) frVal = attr.getInt(); int32_t padVal = 0; if (auto attr = dyn_cast(config.getPad())) padVal = static_cast(attr.getValue()); - std::string padStr = "PadValue::Null"; + padStr = "PadValue::Null"; switch (padVal) { case 1: padStr = "PadValue::Zero"; break; case 2: padStr = "PadValue::Max"; break; case 3: padStr = "PadValue::Min"; break; } - - if (!slStr.empty()) { - extraParams += ", " + slStr + ", " + std::to_string(frVal) + ", " + padStr; - } } // [核心修改] Valid Dims 处理逻辑 (支持混合静态/动态) @@ -3055,7 +3238,8 @@ struct PointerCastConversion : public OpConversionPattern { // 5. 生成 Tile 类型字符串 std::string tileTypeStr = std::string("Tile<") + roleTok + ", " + elemTypeStr + ", " + dimStr + ", " + - layoutParams + ", " + vrowTok + ", " + vcolTok + extraParams + ">"; + layoutParams + ", " + vrowTok + ", " + vcolTok + ", " + slStr + ", " + + std::to_string(frVal) + ", " + padStr + ">"; auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); Value resultValue; @@ -3103,6 +3287,9 @@ struct PointerCastConversion : public OpConversionPattern { rewriter.replaceOp(op, resultValue); return success(); } + +private: + PTOArch targetArch; }; //===----------------------------------------------------------------------===// @@ -3672,6 +3859,136 @@ struct PTORlsBufToEmitC : public OpConversionPattern { } }; +struct PTOInitializePipeToEmitC + : public OpConversionPattern { + PTOInitializePipeToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult matchAndRewrite(mlir::pto::InitializePipeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto pipeTy = dyn_cast(op.getPipe().getType()); + if (!pipeTy) + return rewriter.notifyMatchFailure(op, "expected !pto.pipe result type"); + + int8_t dirMask = static_cast(op.getDirMask()); + if (dirMask == 3) + return rewriter.notifyMatchFailure( + op, "bidirectional DIRMASK is not supported in unified v3"); + if (dirMask != 1 && dirMask != 2) + return rewriter.notifyMatchFailure(op, "unsupported dir_mask"); + + auto flagBase = allocateFlagBaseForInitOp(op); + if (failed(flagBase)) + return rewriter.notifyMatchFailure(op, "insufficient FlagID pairs"); + + auto pipeTypeToken = getTPipePipeTypeToken(dirMask, targetArch); + if (failed(pipeTypeToken)) + return rewriter.notifyMatchFailure(op, "failed to map PipeType"); + + auto syncTokens = getTPipeSyncTypeTokens(*pipeTypeToken); + int slotNum = (dirMask == 3) ? 4 : 8; + auto srcTok = + getEmitCTileTypeTokenFromType(pipeTy.getSrcTileType(), targetArch); + if (failed(srcTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe src type to Tile<...> token"); + auto dstTok = + getEmitCTileTypeTokenFromType(pipeTy.getDstTileType(), targetArch); + if (failed(dstTok)) + return rewriter.notifyMatchFailure( + op, "failed to map pipe dst type to Tile<...> token"); + std::string tpipeTok = + "TPipe<" + std::to_string(*flagBase) + ", " + *pipeTypeToken + ", " + + std::to_string(slotNum) + ", " + std::to_string(slotNum) + ", " + + *srcTok + ", " + *dstTok + ", " + syncTokens.first + ", " + + syncTokens.second + ", VecCubeRatio::V2C1_VECS>"; + + Value pipeHandle; + if (*pipeTypeToken == "FIFOType::GM_FIFO") { + pipeHandle = peelUnrealized(adaptor.getGmSlotBuffer()); + } else if (*pipeTypeToken == "FIFOType::VEC_FIFO") { + pipeHandle = peelUnrealized(adaptor.getC2vConsumerBuf()); + } else { + pipeHandle = peelUnrealized(adaptor.getV2cConsumerBuf()); + } + + auto emitPipeTy = cast(getTypeConverter()->convertType(op.getPipe().getType())); + rewriter.replaceOpWithNewOp( + op, TypeRange{emitPipeTy}, tpipeTok, + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{pipeHandle}); + return success(); + } + + PTOArch targetArch; +}; + +struct PTOTPushToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::TPushOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TPUSH", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getTile()), + peelUnrealized(adaptor.getPipeHandle()), + }); + return success(); + } +}; + +struct PTOTPopToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(mlir::pto::TPopOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TPOP", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getTile()), + peelUnrealized(adaptor.getPipeHandle()), + }); + return success(); + } +}; + +struct PTOTFreeToEmitC + : public OpConversionPattern { + PTOTFreeToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} + + LogicalResult matchAndRewrite(mlir::pto::TFreeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (targetArch == PTOArch::A5) { + rewriter.replaceOpWithNewOp( + op, TypeRange{}, "TFREE", + /*args=*/ArrayAttr{}, + /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{ + peelUnrealized(adaptor.getPipeHandle()), + }); + } else { + // A2A3: tfree is a no-op, slot already released by tpop + rewriter.eraseOp(op); + } + return success(); + } + + PTOArch targetArch; +}; + struct PTOSyncSetToEmitC : public OpConversionPattern { PTOSyncSetToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, PTOArch targetArch) @@ -3965,7 +4282,10 @@ struct PTOTAddToTADD : public OpConversionPattern { // populate patterns //===----------------------------------------------------------------------=== struct ReinterpretCastToEmitC : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + ReinterpretCastToEmitC(TypeConverter &typeConverter, MLIRContext *ctx, + PTOArch targetArch) + : OpConversionPattern(typeConverter, ctx), + targetArch(targetArch) {} LogicalResult matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -4058,7 +4378,11 @@ struct ReinterpretCastToEmitC : public OpConversionPattern"; + ", " + defaults.blTok + ", " + std::to_string(rows) + ", " + + std::to_string(cols) + ", " + defaults.slTok + ", " + + std::to_string(defaults.fractalBytes) + ", " + defaults.padTok + ">"; auto tileType = emitc::OpaqueType::get(ctx, tileTypeStr); Value tile = rewriter @@ -4135,6 +4460,9 @@ struct ReinterpretCastToEmitC : public OpConversionPattern TADDC(dst, src0, src1, src2) @@ -7220,6 +7548,10 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7282,7 +7614,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7353,7 +7685,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx, targetArch); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -7392,6 +7724,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, populateCallOpTypeConversionPattern(patterns, typeConverter); } +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Pass //===----------------------------------------------------------------------===// diff --git a/test/basic/insert_tfree_a2a3_noop.mlir b/test/basic/insert_tfree_a2a3_noop.mlir new file mode 100644 index 00000000..5aeb9fe5 --- /dev/null +++ b/test/basic/insert_tfree_a2a3_noop.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas %s | FileCheck %s +// +// A2A3 architecture: InsertTFreePass should NOT run, no tfree in output. + +module { + func.func @a2a3_no_auto_tfree( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + // No tfree — A2A3 doesn't need it + } + + return + } +} + +// CHECK: TPOP( +// CHECK-NOT: TFREE diff --git a/test/basic/insert_tfree_basic.mlir b/test/basic/insert_tfree_basic.mlir new file mode 100644 index 00000000..8a237758 --- /dev/null +++ b/test/basic/insert_tfree_basic.mlir @@ -0,0 +1,48 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s +// +// A pto.tpop followed by a compute op that reads the popped tile, +// but NO hand-written pto.tfree. The InsertTFree pass should +// auto-insert pto.tfree after the last use of vec_tile. + +module { + func.func @auto_tfree( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + // A second vec tile for the abs destination + %vec_mem2 = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile2 = pto.bind_tile %vec_mem2, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + // Pop data into vec_tile + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + + // Compute op that reads vec_tile — this is the last use + pto.tabs ins(%vec_tile : memref<32x128xf32, #pto.address_space>) outs(%vec_tile2 : memref<32x128xf32, #pto.address_space>) + + // NO pto.tfree here — the pass should insert it automatically + } + + return + } +} + +// CHECK: TPOP( +// CHECK: TABS( +// CHECK: TFREE( diff --git a/test/basic/insert_tfree_no_uses.mlir b/test/basic/insert_tfree_no_uses.mlir new file mode 100644 index 00000000..f99db0a7 --- /dev/null +++ b/test/basic/insert_tfree_no_uses.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s +// +// tpop with no subsequent tile reads — tfree should be inserted right after tpop. + +module { + func.func @tfree_no_uses( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + // tpop but tile is never used after — tfree right after + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + return + } +} + +// CHECK: TPOP( +// CHECK: TFREE( diff --git a/test/basic/insert_tfree_skip_existing.mlir b/test/basic/insert_tfree_skip_existing.mlir new file mode 100644 index 00000000..324f2d49 --- /dev/null +++ b/test/basic/insert_tfree_skip_existing.mlir @@ -0,0 +1,37 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s +// +// Verify that InsertTFreePass skips tpop ops that already have a hand-written tfree. + +module { + func.func @skip_existing_tfree( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + // Hand-written tfree — pass should skip this tpop + pto.tfree(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + return + } +} + +// Expect exactly one TFREE, not two +// CHECK: TPOP( +// CHECK: TFREE( +// CHECK-NOT: TFREE( diff --git a/test/basic/tfree_emitc_a2a3.mlir b/test/basic/tfree_emitc_a2a3.mlir new file mode 100644 index 00000000..fa9de3f5 --- /dev/null +++ b/test/basic/tfree_emitc_a2a3.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @tfree_a2a3_erased( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tfree(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + return + } +} + +// CHECK: TPipe<{{.*}}Tile>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %cube_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %cube_tile = pto.bind_tile %cube_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe_c2v = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + %pipe_v2c = pto.initialize_pipe {dir_mask = 2, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + // C2V: acc pushes, vector pops + frees + pto.section.cube { + pto.tpush(%acc_tile, %pipe_c2v : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + pto.section.vector { + pto.tpop(%vec_tile, %pipe_c2v : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tfree(%pipe_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + // V2C: vector pushes, cube pops + frees + pto.section.vector { + pto.tpush(%vec_tile, %pipe_v2c : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + pto.section.cube { + pto.tpop(%cube_tile, %pipe_v2c : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + pto.tfree(%pipe_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + + return + } +} + +// CHECK: TPipe<{{.*}}Tile +// CHECK: TPUSH( +// CHECK: TPOP( +// CHECK: TFREE( +// CHECK: TPUSH( +// CHECK: TPOP( +// CHECK: TFREE( diff --git a/test/basic/tfree_ops.mlir b/test/basic/tfree_ops.mlir new file mode 100644 index 00000000..cffc8457 --- /dev/null +++ b/test/basic/tfree_ops.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas --pto-arch=a5 %s | FileCheck %s + +module { + func.func @tfree_basic( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tfree(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + return + } +} + +// CHECK: __global__ AICORE void tfree_basic +// CHECK: TPOP( +// CHECK: TFREE( diff --git a/test/basic/tfree_verify.mlir b/test/basic/tfree_verify.mlir new file mode 100644 index 00000000..5589abb0 --- /dev/null +++ b/test/basic/tfree_verify.mlir @@ -0,0 +1,20 @@ +// RUN: not ptoas %s 2>&1 | FileCheck %s + +module { + func.func @tfree_outside_section( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + // tfree outside section — should fail + pto.tfree(%pipe : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + return + } +} + +// CHECK: error: 'pto.tfree' op must be inside a section.cube or section.vector diff --git a/test/basic/tpush_tpop_fifo_emitc.mlir b/test/basic/tpush_tpop_fifo_emitc.mlir new file mode 100644 index 00000000..c1eed0a1 --- /dev/null +++ b/test/basic/tpush_tpop_fifo_emitc.mlir @@ -0,0 +1,47 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @pipe_emitc( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK-NOT: initialize_pipe< +// CHECK-NOT: initialize_pipe( +// CHECK-NOT: memref< +// CHECK-NOT: CrossCoreFIFO_ +// CHECK-NOT: PTOASPipeHandle +// CHECK: auto {{.*}} = TPipe<0, FIFOType::GM_FIFO +// CHECK-SAME: Tile +// CHECK: Tile>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %cube_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %cube_tile = pto.bind_tile %cube_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe_c2v = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + %pipe_v2c = pto.initialize_pipe {dir_mask = 2, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe_c2v : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tpop(%cube_tile, %pipe_v2c : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + pto.tfree(%pipe_v2c : !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + + pto.section.vector { + pto.tpop(%vec_tile, %pipe_c2v : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tfree(%pipe_c2v : !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + pto.tpush(%vec_tile, %pipe_v2c : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK: __global__ AICORE void pipe_emitc_a5_dirmask +// CHECK-NOT: initialize_pipe< +// CHECK-NOT: initialize_pipe( +// CHECK: auto {{.*}} = TPipe<0, FIFOType::VEC_FIFO +// CHECK-SAME: Tile +// CHECK: auto {{.*}} = TPipe<2, FIFOType::MAT_FIFO +// CHECK-SAME: Tile +// CHECK-DAG: TPUSH( +// CHECK-DAG: TPOP( +// CHECK-DAG: TFREE( diff --git a/test/basic/tpush_tpop_fifo_ops.mlir b/test/basic/tpush_tpop_fifo_ops.mlir new file mode 100644 index 00000000..d61a35f6 --- /dev/null +++ b/test/basic/tpush_tpop_fifo_ops.mlir @@ -0,0 +1,46 @@ +// RUN: ptoas %s | FileCheck %s + +module { + func.func @pipe_ops( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + + %acc_mem = memref.alloc() : memref<64x128xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<32x128xf32, #pto.address_space> + %acc_tile = pto.bind_tile %acc_mem, %c64, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<64x128xf32, #pto.address_space> -> memref<64x128xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c32, %c128 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x128xf32, #pto.address_space> -> memref<32x128xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<32x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<32x128xf32, #pto.address_space>, !pto.pipe>, memref<32x128xf32, #pto.address_space>>) + } + + return + } +} + +// CHECK: __global__ AICORE void pipe_ops +// CHECK: TPipe< +// CHECK-NOT: initialize_pipe< +// CHECK-NOT: initialize_pipe( +// CHECK: Tile&1 | FileCheck %s + +module { + func.func @verify_bidirectional_dir_mask_not_supported( + %gm_slot_buffer: memref<64x128xf32, #pto.address_space>, + %acc_tile: memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %pipe = pto.initialize_pipe {dir_mask = 3, slot_size = 1024} + (%gm_slot_buffer : memref<64x128xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<64x128xf32, #pto.address_space>> + + pto.section.cube { + pto.tpush(%acc_tile, %pipe : memref<64x128xf32, #pto.address_space>, !pto.pipe>, memref<64x128xf32, #pto.address_space>>) + } + return + } +} + +// CHECK: error: 'pto.initialize_pipe' op bidirectional DIRMASK is not supported diff --git a/test/samples/BindTile/bind_tile_wrap_pointer_cast.py b/test/samples/BindTile/bind_tile_wrap_pointer_cast.py new file mode 100644 index 00000000..d5ccbcd6 --- /dev/null +++ b/test/samples/BindTile/bind_tile_wrap_pointer_cast.py @@ -0,0 +1,36 @@ +def main() -> None: + # Late-stage IR: explicit UB addresses via pointer_cast. This is used to + # regression-test that plain `pto.bind_tile` does *not* lower to `TRESHAPE` + # (which is reserved for SSA `pto.treshape` view semantics). + print( + r""" +module { + func.func @bind_tile_wrap_pointer_cast() { + %c0_i64 = arith.constant 0 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c32 = arith.constant 32 : index + %cst = arith.constant 3.14 : f32 + + %a_mem = pto.pointer_cast(%c0_i64) : memref<32x32xf32, #pto.address_space> + %b_mem = pto.pointer_cast(%c4096_i64) : memref<32x32xf32, #pto.address_space> + + %a_tile = pto.bind_tile %a_mem, %c32, %c32 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x32xf32, #pto.address_space> -> memref<32x32xf32, #pto.address_space> + + %b_tile = pto.bind_tile %b_mem, %c32, %c32 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<32x32xf32, #pto.address_space> -> memref<32x32xf32, #pto.address_space> + + pto.tadds ins(%a_tile, %cst : memref<32x32xf32, #pto.address_space>, f32) + outs(%b_tile : memref<32x32xf32, #pto.address_space>) + return + } +} +""" + ) + + +if __name__ == "__main__": + main() + diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 8bd54ebd..6da66a53 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -259,8 +259,15 @@ process_one_dir() { pto_input="$decoded_pto" fi + local -a per_sample_flags=() + # This sample builds a late-stage IR with explicit pointer_cast addresses, + # which only assembles at Level-3 (skip PlanMemory). + if [[ "$base" == "bind_tile_wrap_pointer_cast" ]]; then + per_sample_flags+=(--pto-level=level3) + fi + # Write output via -o to avoid mixing debug prints with generated C++. - local -a ptoas_cmd=("${ptoas_cmd_base[@]}" "$pto_input" -o "$cpp") + local -a ptoas_cmd=("${ptoas_cmd_base[@]}" "${per_sample_flags[@]}" "$pto_input" -o "$cpp") if ! "${ptoas_cmd[@]}" >/dev/null 2>&1; then if [[ $expect_fail -eq 1 ]]; then echo -e "${A}(${base}.py)\tXFAIL\tptoas failed as expected" @@ -407,6 +414,27 @@ process_one_dir() { fi fi + # Regression guard: plain pto.bind_tile (no view semantics) must lower via + # address binding, not `TRESHAPE(dst, src)`. Otherwise users see spurious + # reshape instructions and pto-isa static checks may reject the kernel. + if [[ "$base" == "bind_tile_wrap_pointer_cast" ]]; then + if grep -Fq "TRESHAPE(" "$cpp"; then + echo -e "${A}(${base}.py) FAIL unexpected TRESHAPE() lowering for plain bind_tile" + overall=1 + continue + fi + if [[ $(grep -c "TASSIGN(" "$cpp") -lt 2 ]]; then + echo -e "${A}(${base}.py) FAIL expected at least 2 TASSIGN() calls" + overall=1 + continue + fi + if ! grep -Fq "TADDS(" "$cpp"; then + echo -e "${A}(${base}.py) FAIL missing TADDS() lowering" + overall=1 + continue + fi + fi + if [[ "$base" == "bitcast_dtype_alias" ]]; then if ! grep -Eq "Tile<[^>]*, int32_t," "$cpp"; then echo -e "${A}(${base}.py) FAIL missing int32_t Tile declaration for pto.bitcast" diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/README.md b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md new file mode 100644 index 00000000..6d86fc6b --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/README.md @@ -0,0 +1,47 @@ +# A5 临时样例: TMATMUL + TPUSH/TPOP(UB) + TPRINT + +这个目录是临时验证用例,可直接删除。 + +## 目录说明 + +- `kernel.mlir` + - PTOAS 源 IR,表达目标链路: + - Cube: `tload(gm->mat)` -> `tmov(mat->left/right)` -> `tmatmul` -> `tpush(acc, pipe)` + - Vector: `tpop(vec, pipe)` -> `tprint` +- `kernel_a5_manual.cpp` + - 可运行的 A5 kernel C++(保留 `TPRINT`) +- `main.cpp` + - Host 侧 ACL 启动代码,构造输入并拉起 kernel + - 输入设为 `A=单位阵`、`B=全1矩阵`,便于按 `TPRINT` 结果快速验算 + +## 1. 用 PTOAS 检查 MLIR + +在仓库根目录执行: + +```bash +./build/tools/ptoas/ptoas --pto-arch=a5 test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir > /tmp/tpush_tpop_matmul_pipe.out +``` + +可提取 emitc 末尾 C++: + +```bash +awk 'BEGIN{emit=0} /^#include "pto\/pto-inst.hpp"/{emit=1} emit{print}' /tmp/tpush_tpop_matmul_pipe.out > /tmp/tpush_tpop_matmul_pipe_emitc.cpp +``` + +说明:当前流水线中,`kernel.mlir` 里的 `pto.tprint` 在部分 pass 组合下可能被提前折叠。 +所以本目录提供了 `kernel_a5_manual.cpp` 来保证运行路径上确实有 `TPRINT`。 + +## 2. 运行方式(参考 pto-isa A5 st 框架) + +将 `kernel_a5_manual.cpp` + `main.cpp` 按你现有 A5 st 工程的 CMake 方式接入并编译。 + +运行后,Vector 段执行 `TPRINT(vecOut)`,日志里应可看到打印输出。 +本样例里理论结果应是全 `1.0`(`A * B = I * Ones = Ones`)。 + +## 3. 参数约定 + +- Tile 形状:`16x16` +- 输入类型:`f32` +- 输出累加类型:`f32` +- `pipe`:`dir_mask = 1`,C2V,A5 下映射 UB `VEC_FIFO` +- `main.cpp` 里 `c2vBuf = 0x10000` 为示例 UB 基址 diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir new file mode 100644 index 00000000..71cc38f3 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel.mlir @@ -0,0 +1,67 @@ +module { + func.func @matmul_tpush_tpop_print( + %gm_a: memref<16x16xf32, #pto.address_space>, + %gm_b: memref<16x16xf32, #pto.address_space>, + %gm_slot_buffer: memref<16x16xf32, #pto.address_space>, + %c2v_consumer_buf: i32, + %v2c_consumer_buf: i32) { + %c16 = arith.constant 16 : index + + %mat_a_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %mat_b_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %left_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %right_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %acc_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + %vec_mem = memref.alloc() : memref<16x16xf32, #pto.address_space> + + // TileBuf-only tprint path (currently may be folded out by some lowering passes). + %vec_print = pto.alloc_tile : !pto.tile_buf + + %mat_a_tile = pto.bind_tile %mat_a_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %mat_b_tile = pto.bind_tile %mat_b_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %left_tile = pto.bind_tile %left_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %right_tile = pto.bind_tile %right_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %acc_tile = pto.bind_tile %acc_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=1024, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + %vec_tile = pto.bind_tile %vec_mem, %c16, %c16 { + config = #pto.tile_buf_config, slayout=#pto.slayout, s_fractal_size=512, pad=#pto.pad_value> + } : memref<16x16xf32, #pto.address_space> -> memref<16x16xf32, #pto.address_space> + + %pipe = pto.initialize_pipe {dir_mask = 1, slot_size = 1024} + (%gm_slot_buffer : memref<16x16xf32, #pto.address_space>, + %c2v_consumer_buf : i32, + %v2c_consumer_buf : i32) + -> !pto.pipe>, memref<16x16xf32, #pto.address_space>> + + pto.section.cube { + pto.tload ins(%gm_a : memref<16x16xf32, #pto.address_space>) outs(%mat_a_tile : memref<16x16xf32, #pto.address_space>) + pto.tload ins(%gm_b : memref<16x16xf32, #pto.address_space>) outs(%mat_b_tile : memref<16x16xf32, #pto.address_space>) + + pto.tmov ins(%mat_a_tile : memref<16x16xf32, #pto.address_space>) outs(%left_tile : memref<16x16xf32, #pto.address_space>) + pto.tmov ins(%mat_b_tile : memref<16x16xf32, #pto.address_space>) outs(%right_tile : memref<16x16xf32, #pto.address_space>) + + pto.tmatmul ins(%left_tile, %right_tile : memref<16x16xf32, #pto.address_space>, memref<16x16xf32, #pto.address_space>) outs(%acc_tile : memref<16x16xf32, #pto.address_space>) + + pto.tpush(%acc_tile, %pipe : memref<16x16xf32, #pto.address_space>, !pto.pipe>, memref<16x16xf32, #pto.address_space>>) + } + + pto.section.vector { + pto.tpop(%vec_tile, %pipe : memref<16x16xf32, #pto.address_space>, !pto.pipe>, memref<16x16xf32, #pto.address_space>>) + pto.tmov ins(%vec_tile : memref<16x16xf32, #pto.address_space>) outs(%vec_print : !pto.tile_buf) + pto.tprint ins(%vec_print : !pto.tile_buf) + } + + return + } +} diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp new file mode 100644 index 00000000..3e9cf987 --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/kernel_a5_manual.cpp @@ -0,0 +1,78 @@ +#include "pto/pto-inst.hpp" + +using namespace pto; + +__global__ AICORE void matmul_tpush_tpop_print(__gm__ float *gm_a, __gm__ float *gm_b, + __gm__ float *gm_slot_buffer, + int32_t c2v_consumer_buf, + int32_t v2c_consumer_buf) +{ + (void)gm_slot_buffer; + (void)v2c_consumer_buf; + + int64_t base0 = 0; + int64_t base512 = 512; + + Tile matA; + TASSIGN(matA, base0); + Tile matB; + TASSIGN(matB, base512); + + Tile leftA; + TASSIGN(leftA, base0); + Tile rightB; + TASSIGN(rightB, base0); + + Tile accC; + TASSIGN(accC, base0); + Tile vecOut; + TASSIGN(vecOut, base0); + + auto pipe = TPipe<0, FIFOType::VEC_FIFO, 8, 8, + Tile, + Tile, + TSyncOpType::TMOV_C2UB, TSyncOpType::NONE, VecCubeRatio::V2C1_VECS>(c2v_consumer_buf); + +#if defined(__DAV_CUBE__) + using GTShape = pto::Shape<1, 1, 1, 16, 16>; + using GTStride = pto::Stride<256, 256, 256, 16, 1>; + using GlobalFloat = GlobalTensor; + + GTShape shape = GTShape(); + GTStride stride = GTStride(); + + GlobalFloat gA(gm_a, shape, stride); + GlobalFloat gB(gm_b, shape, stride); + + TLOAD(matA, gA); + TLOAD(matB, gB); + TMOV(leftA, matA); + TMOV(rightB, matB); + TMATMUL(accC, leftA, rightB); + TPUSH(accC, pipe); +#endif + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + TPOP(vecOut, pipe); + TPRINT(vecOut); + TFREE(pipe); +#endif +} + +template +void LaunchMatmulTPushPopPrint(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream) +{ + (void)tilingKey; + matmul_tpush_tpop_print<<<1, nullptr, stream>>>(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(slot), + c2vBuf, v2cBuf); +} + +template void LaunchMatmulTPushPopPrint<1>(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream); diff --git a/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp new file mode 100644 index 00000000..353bfabf --- /dev/null +++ b/test/tmp_tpush_tpop_matmul_pipe_a5/main.cpp @@ -0,0 +1,70 @@ +#include "acl/acl.h" +#include +#include +#include +#include + +template +void LaunchMatmulTPushPopPrint(uint8_t *a, uint8_t *b, uint8_t *slot, + int32_t c2vBuf, int32_t v2cBuf, void *stream); + +#define ACL_CHECK(expr) \ + do { \ + aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ACL ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) + +int main() +{ + constexpr int M = 16; + constexpr int K = 16; + constexpr int N = 16; + constexpr size_t aBytes = M * K * sizeof(float); + constexpr size_t bBytes = K * N * sizeof(float); + constexpr size_t slotBytes = M * N * sizeof(float); + + std::vector hostA(M * K, 0.0f); + std::vector hostB(K * N, 1.0f); + std::vector hostSlot(M * N, 0.0f); + for (int i = 0; i < M; ++i) { + hostA[i * K + i] = 1.0f; // A = I + } + + ACL_CHECK(aclInit(nullptr)); + ACL_CHECK(aclrtSetDevice(0)); + + aclrtStream stream = nullptr; + ACL_CHECK(aclrtCreateStream(&stream)); + + uint8_t *devA = nullptr; + uint8_t *devB = nullptr; + uint8_t *devSlot = nullptr; + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devA), aBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devB), bBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(reinterpret_cast(&devSlot), slotBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(devA, aBytes, hostA.data(), aBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devB, bBytes, hostB.data(), bBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(devSlot, slotBytes, hostSlot.data(), slotBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + + // For A5 C2V VEC_FIFO path, c2vBuf is local UB base address (example value). + constexpr int32_t c2vBuf = 0x10000; + constexpr int32_t v2cBuf = 0; + + LaunchMatmulTPushPopPrint<1>(devA, devB, devSlot, c2vBuf, v2cBuf, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + + std::puts("Kernel finished. Expect TPRINT output to be all 1.0 (A=I, B=all-ones)."); + + aclrtFree(devA); + aclrtFree(devB); + aclrtFree(devSlot); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + return 0; +} diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 030826b5..fc82e13f 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -679,6 +679,11 @@ int main(int argc, char **argv) { } module->getOperation()->setAttr("pto.target_arch", mlir::StringAttr::get(&context, arch)); + + // Insert pto.tfree at earliest safe point (A5 only, after PlanMemory). + if (arch == "a5") + pm.addNestedPass(pto::createPTOInsertTFreePass()); + if (arch == "a3") { pm.addPass(pto::createEmitPTOManualPass(pto::PTOArch::A3)); } else {