diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index c967a75e..28b1c85d 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -100,6 +100,7 @@ def AddPtrOp : PTO_Op<"addptr", [ //===----------------------------------------------------------------------===// def LoadScalarOp : PTO_Op<"load_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods<MemoryEffectsOpInterface> ]> { let summary = "Load a single scalar element from a pointer at offset."; @@ -116,9 +117,14 @@ def LoadScalarOp : PTO_Op<"load_scalar", [ let assemblyFormat = [{ $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def StoreScalarOp : PTO_Op<"store_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods<MemoryEffectsOpInterface> ]> { let summary = "Store a single scalar element to a pointer at offset."; @@ -136,6 +142,10 @@ def StoreScalarOp : PTO_Op<"store_scalar", [ let assemblyFormat = [{ $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index a33bf889..b91e9f68 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -1,4 +1,5 @@ #include "PTO/Transforms/InsertSync/InsertSyncAnalysis.h" +#include "PTO/IR/PTO.h" #include "PTO/Transforms/InsertSync/SyncCommon.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -121,9 +122,9 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType frontPipe = frontCompound->kPipeValue; const PipelineType nowPipe = nowCompound->kPipeValue; - if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { + 
// Scalar pipe is in-order on target hardware; skip same-pipe sync. + if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) return true; - } if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { return true; diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index 33aec28b..b5e3b463 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -7,6 +7,7 @@ #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Matchers.h" +#include <algorithm> // [P0 新增] 引入副作用接口和 PTO 接口 #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -15,18 +16,41 @@ using namespace mlir; using namespace mlir::pto; +static int64_t getElementSizeInBytes(Type elemType) { + if (auto intTy = dyn_cast<IntegerType>(elemType)) { + return std::max<int64_t>(1, intTy.getWidth() / 8); + } + if (auto floatTy = dyn_cast<FloatType>(elemType)) { + return std::max<int64_t>(1, floatTy.getWidth() / 8); + } + if (isa<IndexType>(elemType)) { + return 8; + } + return 1; +} + // [辅助函数] 尝试从 Operation 中计算相对于 Source 的字节偏移量和新大小 // 返回值: pair<offsetBytes, newSize> // 如果无法计算静态值,返回 {-1, -1} 表示这是动态的 static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value src) { - auto srcType = dyn_cast<MemRefType>(src.getType()); - if (!srcType) return {0, 0}; - - int64_t elemSize = srcType.getElementType().getIntOrFloatBitWidth() / 8; - if (elemSize == 0) elemSize = 1; + Type srcElemType = nullptr; + if (auto srcType = dyn_cast<MemRefType>(src.getType())) { + srcElemType = srcType.getElementType(); + } else if (auto ptrType = dyn_cast<pto::PtrType>(src.getType())) { + srcElemType = ptrType.getElementType(); + } else { + return {0, 0}; + } + + const int64_t elemSize = getElementSizeInBytes(srcElemType); // === Case 1: memref.subview === if (auto subView = dyn_cast<memref::SubViewOp>(op)) { + auto srcType = dyn_cast<MemRefType>(src.getType()); + if (!srcType) { + return {-1, -1}; + } + int64_t baseOffset; SmallVector<int64_t> strides; if (failed(mlir::getStridesAndOffset(srcType, strides, 
baseOffset))) { @@ -71,6 +95,15 @@ static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value s } return {staticOffsets[0] * elemSize, 0}; } + + // === Case 3: pto.addptr === + if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) { + llvm::APInt apIntValue; + if (!matchPattern(addPtrOp.getOffset(), m_ConstantInt(&apIntValue))) { + return {-1, -1}; + } + return {apIntValue.getSExtValue() * elemSize, 0}; + } return {0, 0}; } @@ -138,6 +171,9 @@ void PTOIRTranslator::RecursionIR(Region *region) { else if (auto bindTileOp = dyn_cast<pto::BindTileOp>(op)) { UpdateAliasBufferInfo(bindTileOp.getResult(), bindTileOp.getSource()); } + else if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) { + UpdateAliasBufferInfo(addPtrOp.getResult(), addPtrOp.getPtr()); + } else if (auto subViewOp = dyn_cast<memref::SubViewOp>(op)) { UpdateAliasBufferInfo(subViewOp.getResult(), subViewOp.getSource()); } @@ -496,28 +532,38 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { if (!buffer2MemInfoMap_.contains(source)) return; int64_t deltaOffset = 0; - int64_t newSize = -1; + int64_t newSize = -1; + bool hasUnknownAliasRange = false; if (auto op = result.getDefiningOp()) { auto info = getStaticOffsetAndSize(op, source); - if (info.first != -1) { - deltaOffset = info.first; - if (info.second > 0) newSize = info.second; - } + if (info.first == -1) { + hasUnknownAliasRange = true; + } else { + deltaOffset = info.first; + if (info.second > 0) newSize = info.second; + } } auto &resultMemInfoVec = buffer2MemInfoMap_[result]; for (auto &parentInfo : buffer2MemInfoMap_[source]) { auto newInfo = parentInfo->clone(result); - - if (!newInfo->baseAddresses.empty()) { - newInfo->baseAddresses[0] += deltaOffset; + + if (hasUnknownAliasRange) { + // Dynamic pointer arithmetic cannot be modeled precisely here. + // Keep root/scope aliasing, but drop concrete range info conservatively. 
+ newInfo->baseAddresses.clear(); + newInfo->allocateSize = 0; } else { + if (!newInfo->baseAddresses.empty()) { + newInfo->baseAddresses[0] += deltaOffset; + } else { newInfo->baseAddresses.push_back(deltaOffset); + } } - - if (newSize > 0) { + + if (!hasUnknownAliasRange && newSize > 0) { newInfo->allocateSize = newSize; } diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py new file mode 100644 index 00000000..47310dc3 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -0,0 +1,66 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def _idx_const(v: int): + return arith.ConstantOp(IndexType.get(), v).result + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + tv2 = pto.TensorViewType.get(2, f32, ctx) + tile_view = pto.PartitionTensorViewType.get([16, 16], f32, ctx) + + vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + tile_buf = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_cross_pipe", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + src, dst = entry.arguments + c0 = _idx_const(0) + c1 = _idx_const(1) + c4 = _idx_const(4) + c16 = _idx_const(16) + + src_tv = pto.MakeTensorViewOp(tv2, src, [c16, c16], [c16, c1]).result + dst_tv = pto.MakeTensorViewOp(tv2, dst, [c16, c16], [c16, c1]).result + src_part = pto.PartitionViewOp( + tile_view, 
src_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + dst_part = pto.PartitionViewOp( + tile_view, dst_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + ub = pto.AllocTileOp(tile_buf).result + + src_off = pto.addptr(src, c4) + dst_off = pto.addptr(dst, c4) + one = arith.ConstantOp(f32, 1.0).result + pto.TLoadOp(None, src_part, ub) + pto.store_scalar(src_off, c0, one) + loaded = pto.load_scalar(f32, dst_off, c0) + pto.store_scalar(dst_off, c1, loaded) + pto.TStoreOp(None, ub, dst_part) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py new file mode 100644 index 00000000..6c61ae26 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py @@ -0,0 +1,39 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_intra_pipe_barrier", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + ptr = entry.arguments[0] + c0 = arith.ConstantOp(idx, 0).result + c4 = arith.ConstantOp(idx, 4).result + one = arith.ConstantOp(f32, 1.0).result + two = arith.ConstantOp(f32, 2.0).result + + ptr_off = pto.addptr(ptr, c4) + pto.store_scalar(ptr_off, c0, one) + pto.store_scalar(ptr_off, c0, two) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ef71560d..933bfb2f 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh 
@@ -292,6 +292,66 @@ process_one_dir() { fi fi + # Scalar sync regression: scalar load/store should participate in PIPE_S + # auto-sync and correctly connect with supported DMA directions. + if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then + if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S->PIPE_MTE2 event in scalar cross-pipe case" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE3->PIPE_S event in scalar cross-pipe case" + overall=1 + continue + fi + fi + + # Scalar intra-pipe regression: PIPE_S local dependency should not inject + # extra sync (PIPE_S is in-order); only function-tail PIPE_ALL remains. 
+ if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then + local bar_all_cnt + bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" + if grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_S event sync for scalar intra-pipe dependency" + overall=1 + continue + fi + if [[ "${bar_all_cnt}" -ne 1 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_ALL barrier count=${bar_all_cnt} (expect 1 tail barrier)" + overall=1 + continue + fi + fi + # Regression guard for issue #185: barrier_sync must support op types # beyond TMATMUL/TVEC and lower to the expected per-pipe barrier. if [[ "$base" == "test_barrier_sync" ]]; then