From 8aced45e772bb09b0ed78b0809d07aae805b1ac6 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 11:01:42 +0800 Subject: [PATCH 1/4] [InsertSync] add PIPE_S scalar sync and addptr alias-through --- include/PTO/IR/PTOOps.td | 10 +++ .../InsertSync/InsertSyncAnalysis.cpp | 11 ++- .../Transforms/InsertSync/PTOIRTranslator.cpp | 76 +++++++++++++++---- .../test_inject_sync_scalar_cross_pipe.py | 68 +++++++++++++++++ ...t_inject_sync_scalar_intra_pipe_barrier.py | 39 ++++++++++ test/samples/runop.sh | 35 +++++++++ 6 files changed, 223 insertions(+), 16 deletions(-) create mode 100644 test/samples/Sync/test_inject_sync_scalar_cross_pipe.py create mode 100644 test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index c967a75e..28b1c85d 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -100,6 +100,7 @@ def AddPtrOp : PTO_Op<"addptr", [ //===----------------------------------------------------------------------===// def LoadScalarOp : PTO_Op<"load_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods ]> { let summary = "Load a single scalar element from a pointer at offset."; @@ -116,9 +117,14 @@ def LoadScalarOp : PTO_Op<"load_scalar", [ let assemblyFormat = [{ $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def StoreScalarOp : PTO_Op<"store_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods ]> { let summary = "Store a single scalar element to a pointer at offset."; @@ -136,6 +142,10 @@ def StoreScalarOp : PTO_Op<"store_scalar", [ let assemblyFormat = [{ $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index a33bf889..e236f3c4 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -1,4 +1,5 @@ #include "PTO/Transforms/InsertSync/InsertSyncAnalysis.h" +#include "PTO/IR/PTO.h" #include "PTO/Transforms/InsertSync/SyncCommon.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -23,6 +24,10 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } +static bool isScalarMemoryOp(Operation *op) { + return isa(op); +} + // ============================================================================== // 1. Entry Point // ============================================================================== @@ -122,7 +127,11 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType nowPipe = nowCompound->kPipeValue; if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { - return true; + Operation *nowOp = nowCompound->elementOp; + Operation *frontOp = frontCompound->elementOp; + if (!isScalarMemoryOp(nowOp) && !isScalarMemoryOp(frontOp)) { + return true; + } } if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index 33aec28b..b5e3b463 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -7,6 +7,7 @@ #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Matchers.h" +#include // [P0 新增] 引入副作用接口和 PTO 接口 #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -15,18 +16,41 @@ using namespace mlir; using namespace mlir::pto; +static int64_t getElementSizeInBytes(Type elemType) { + if (auto intTy = dyn_cast(elemType)) { + return std::max(1, intTy.getWidth() / 8); + } + if (auto floatTy = dyn_cast(elemType)) { + return std::max(1, floatTy.getWidth() / 8); + } + if (isa(elemType)) { + return 8; + } + return 1; +} + // [辅助函数] 尝试从 Operation 中计算相对于 Source 的字节偏移量和新大小 // 返回值: pair // 如果无法计算静态值,返回 {-1, -1} 表示这是动态的 static std::pair getStaticOffsetAndSize(Operation *op, Value src) { - auto srcType = dyn_cast(src.getType()); - if (!srcType) return {0, 0}; - - int64_t elemSize = srcType.getElementType().getIntOrFloatBitWidth() / 8; - if (elemSize == 0) elemSize = 1; + Type srcElemType = nullptr; + if (auto srcType = dyn_cast(src.getType())) { + srcElemType = srcType.getElementType(); + } else if (auto ptrType = dyn_cast(src.getType())) { + srcElemType = ptrType.getElementType(); + } else { + return {0, 0}; + } + + const int64_t elemSize = getElementSizeInBytes(srcElemType); // === Case 1: memref.subview === if (auto subView = dyn_cast(op)) { + auto srcType = dyn_cast(src.getType()); + if (!srcType) { + return {-1, -1}; + } + int64_t baseOffset; SmallVector strides; if (failed(mlir::getStridesAndOffset(srcType, strides, baseOffset))) { @@ -71,6 +95,15 @@ static std::pair getStaticOffsetAndSize(Operation *op, Value s } return {staticOffsets[0] * elemSize, 0}; } + + // === Case 3: pto.addptr === + if (auto addPtrOp = dyn_cast(op)) { + llvm::APInt apIntValue; + if (!matchPattern(addPtrOp.getOffset(), m_ConstantInt(&apIntValue))) { + return {-1, -1}; + } + return {apIntValue.getSExtValue() * elemSize, 0}; + } return {0, 0}; } @@ -138,6 +171,9 @@ void PTOIRTranslator::RecursionIR(Region *region) { else if (auto bindTileOp = dyn_cast(op)) { UpdateAliasBufferInfo(bindTileOp.getResult(), bindTileOp.getSource()); } + else if (auto addPtrOp = dyn_cast(op)) { + UpdateAliasBufferInfo(addPtrOp.getResult(), addPtrOp.getPtr()); + } else if (auto subViewOp = dyn_cast(op)) { UpdateAliasBufferInfo(subViewOp.getResult(), subViewOp.getSource()); } @@ -496,28 +532,38 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { if (!buffer2MemInfoMap_.contains(source)) return; int64_t deltaOffset = 0; - int64_t newSize = -1; + int64_t newSize = -1; + bool hasUnknownAliasRange = false; if (auto op = result.getDefiningOp()) { auto info = getStaticOffsetAndSize(op, source); - if (info.first != -1) { - deltaOffset = info.first; - if (info.second > 0) newSize = info.second; - } + if (info.first == -1) { + hasUnknownAliasRange = true; + } else { + deltaOffset = info.first; + if (info.second > 0) newSize = info.second; + } } auto &resultMemInfoVec = buffer2MemInfoMap_[result]; for (auto &parentInfo : buffer2MemInfoMap_[source]) { auto newInfo = parentInfo->clone(result); - - if (!newInfo->baseAddresses.empty()) { - newInfo->baseAddresses[0] += deltaOffset; + + if (hasUnknownAliasRange) { + // Dynamic pointer arithmetic cannot be modeled precisely here. + // Keep root/scope aliasing, but drop concrete range info conservatively. + newInfo->baseAddresses.clear(); + newInfo->allocateSize = 0; } else { + if (!newInfo->baseAddresses.empty()) { + newInfo->baseAddresses[0] += deltaOffset; + } else { newInfo->baseAddresses.push_back(deltaOffset); + } } - - if (newSize > 0) { + + if (!hasUnknownAliasRange && newSize > 0) { newInfo->allocateSize = newSize; } diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py new file mode 100644 index 00000000..306642dd --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -0,0 +1,68 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def _idx_const(v: int): + return arith.ConstantOp(IndexType.get(), v).result + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + tv2 = pto.TensorViewType.get(2, f32, ctx) + tile_view = pto.PartitionTensorViewType.get([16, 16], f32, ctx) + + vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + tile_buf = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_cross_pipe", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + src, dst = entry.arguments + c0 = _idx_const(0) + c1 = _idx_const(1) + c4 = _idx_const(4) + c16 = _idx_const(16) + + src_tv = pto.MakeTensorViewOp(tv2, src, [c16, c16], [c16, c1]).result + dst_tv = pto.MakeTensorViewOp(tv2, dst, [c16, c16], [c16, c1]).result + src_part = pto.PartitionViewOp( + tile_view, src_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + dst_part = pto.PartitionViewOp( + tile_view, dst_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + ub = pto.AllocTileOp(tile_buf).result + + src_off = pto.addptr(src, c4) + one = arith.ConstantOp(f32, 1.0).result + pto.store_scalar(src_off, c0, one) + + pto.TLoadOp(None, src_part, ub) + pto.TStoreOp(None, ub, dst_part) + + dst_off = pto.addptr(dst, c4) + loaded = pto.load_scalar(f32, dst_off, c0) + pto.store_scalar(dst_off, c1, loaded) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py new file mode 100644 index 00000000..6c61ae26 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py @@ -0,0 +1,39 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_intra_pipe_barrier", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + ptr = entry.arguments[0] + c0 = arith.ConstantOp(idx, 0).result + c4 = arith.ConstantOp(idx, 4).result + one = arith.ConstantOp(f32, 1.0).result + two = arith.ConstantOp(f32, 2.0).result + + ptr_off = pto.addptr(ptr, c4) + pto.store_scalar(ptr_off, c0, one) + pto.store_scalar(ptr_off, c0, two) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ef71560d..60ea4688 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -292,6 +292,41 @@ process_one_dir() { fi fi + # Scalar sync regression: scalar store/load should participate in PIPE_S + # auto-sync and correctly connect with DMA pipelines. + if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE2" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE2" + overall=1 + continue + fi + if ! grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE3->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE3->PIPE_S" + overall=1 + continue + fi + fi + + # Scalar intra-pipe regression: dependent scalar PIPE_S accesses should be + # serialized by a per-pipe barrier. + if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then + if ! grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + fi + # Regression guard for issue #185: barrier_sync must support op types # beyond TMATMUL/TVEC and lower to the expected per-pipe barrier. if [[ "$base" == "test_barrier_sync" ]]; then From 40cee3b694f2789fd4eeb25bc0dd58f053cb578f Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 16:27:36 +0800 Subject: [PATCH 2/4] [InsertSync] align scalar sync tests and add A3-safe PIPE_ALL fallback --- .../InsertSync/InsertSyncAnalysis.cpp | 29 ++++++++++++- .../test_inject_sync_scalar_cross_pipe.py | 8 ++-- test/samples/runop.sh | 42 ++++++++++++------- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index e236f3c4..92c81f3e 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -28,6 +28,21 @@ static bool isScalarMemoryOp(Operation *op) { return isa(op); } +static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { + // A3 runtime is unstable for these scalar synchronization forms: + // 1) PIPE_S local barrier + // 2) PIPE_S -> PIPE_MTE2 + // 3) PIPE_MTE3 -> PIPE_S + // Conservatively fall back to PIPE_ALL barrier to preserve correctness. + if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_S) + return true; + if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) + return true; + if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) + return true; + return false; +} + // ============================================================================== // 1. Entry Point // ============================================================================== @@ -338,7 +353,19 @@ void InsertSyncAnalysis::InsertSyncOperation( PipelineType nowPipe = nowCompound->kPipeValue; PipelineType frontPipe = frontCompound->kPipeValue; - if (nowPipe == frontPipe) { + if (needsPipeAllBarrier(frontPipe, nowPipe)) { + unsigned insertBarrierId = nowCompound->GetIndex(); + auto barrierOp = std::make_unique( + SyncOperation::TYPE::PIPE_BARRIER, PipelineType::PIPE_ALL, + PipelineType::PIPE_ALL, syncIndex_, insertBarrierId, forEndIndex); + barrierOp->SetDepSyncIRIndex(frontCompound->GetIndex()); + syncIR_[insertBarrierId]->pipeBefore.push_back(barrierOp.get()); + barrierOp->SetSyncIRIndex(insertBarrierId); + + SmallVector> newSync; + newSync.emplace_back(std::move(barrierOp)); + syncOperations_.emplace_back(std::move(newSync)); + } else if (nowPipe == frontPipe) { unsigned insertBarrierId = nowCompound->GetIndex(); auto barrierOp = std::make_unique( SyncOperation::TYPE::PIPE_BARRIER, frontPipe, nowPipe, syncIndex_, diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py index 306642dd..47310dc3 100644 --- a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -48,15 +48,13 @@ def build(): ub = pto.AllocTileOp(tile_buf).result src_off = pto.addptr(src, c4) + dst_off = pto.addptr(dst, c4) one = arith.ConstantOp(f32, 1.0).result - pto.store_scalar(src_off, c0, one) - pto.TLoadOp(None, src_part, ub) - pto.TStoreOp(None, ub, dst_part) - - dst_off = pto.addptr(dst, c4) + pto.store_scalar(src_off, c0, one) loaded = pto.load_scalar(f32, dst_off, c0) pto.store_scalar(dst_off, c1, loaded) + pto.TStoreOp(None, ub, dst_part) func.ReturnOp([]) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 60ea4688..896ed443 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -292,36 +292,50 @@ process_one_dir() { fi fi - # Scalar sync regression: scalar store/load should participate in PIPE_S - # auto-sync and correctly connect with DMA pipelines. + # Scalar sync regression: scalar load/store should participate in PIPE_S + # auto-sync and correctly connect with supported DMA directions. if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then - if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE2" + if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_S" overall=1 continue fi - if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE2" + if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_S" overall=1 continue fi - if ! grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE3->PIPE_S" + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE3" overall=1 continue fi - if ! grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE3->PIPE_S" + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S->PIPE_MTE2 event in scalar cross-pipe case" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE3->PIPE_S event in scalar cross-pipe case" overall=1 continue fi fi - # Scalar intra-pipe regression: dependent scalar PIPE_S accesses should be - # serialized by a per-pipe barrier. + # Scalar intra-pipe regression: dependent scalar accesses should be + # serialized by an extra safety barrier (beyond the function-tail PIPE_ALL). if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then - if ! grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + local bar_all_cnt + bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" + if [[ "${bar_all_cnt}" -lt 2 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tmissing extra pipe_barrier(PIPE_ALL) for scalar intra-pipe dependency" overall=1 continue fi From 3938e52f23345db1411f739d3e1d73605d4b5f48 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 17:08:11 +0800 Subject: [PATCH 3/4] [InsertSync] skip S->S sync and align scalar intra-pipe check --- .../InsertSync/InsertSyncAnalysis.cpp | 21 +++++-------------- test/samples/runop.sh | 19 +++++++++++++---- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index 92c81f3e..3bb3b034 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -24,18 +24,11 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } -static bool isScalarMemoryOp(Operation *op) { - return isa(op); -} - static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { // A3 runtime is unstable for these scalar synchronization forms: - // 1) PIPE_S local barrier - // 2) PIPE_S -> PIPE_MTE2 - // 3) PIPE_MTE3 -> PIPE_S + // 1) PIPE_S -> PIPE_MTE2 + // 2) PIPE_MTE3 -> PIPE_S // Conservatively fall back to PIPE_ALL barrier to preserve correctness. - if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_S) - return true; if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) return true; if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) @@ -141,13 +134,9 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType frontPipe = frontCompound->kPipeValue; const PipelineType nowPipe = nowCompound->kPipeValue; - if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { - Operation *nowOp = nowCompound->elementOp; - Operation *frontOp = frontCompound->elementOp; - if (!isScalarMemoryOp(nowOp) && !isScalarMemoryOp(frontOp)) { - return true; - } - } + // Scalar pipe is in-order on target hardware; skip same-pipe sync. + if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) + return true; if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { return true; diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 896ed443..933bfb2f 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -329,13 +329,24 @@ process_one_dir() { fi fi - # Scalar intra-pipe regression: dependent scalar accesses should be - # serialized by an extra safety barrier (beyond the function-tail PIPE_ALL). + # Scalar intra-pipe regression: PIPE_S local dependency should not inject + # extra sync (PIPE_S is in-order); only function-tail PIPE_ALL remains. if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then local bar_all_cnt bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" - if [[ "${bar_all_cnt}" -lt 2 ]]; then - echo -e "${A}(${base}.py)\tFAIL\tmissing extra pipe_barrier(PIPE_ALL) for scalar intra-pipe dependency" + if grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_S event sync for scalar intra-pipe dependency" + overall=1 + continue + fi + if [[ "${bar_all_cnt}" -ne 1 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_ALL barrier count=${bar_all_cnt} (expect 1 tail barrier)" overall=1 continue fi From 6c8c44e6cf641e96c8a41adbb26ea178a13757b7 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Fri, 6 Mar 2026 17:14:09 +0800 Subject: [PATCH 4/4] [InsertSync] remove scalar cross-pipe PIPE_ALL fallback --- .../InsertSync/InsertSyncAnalysis.cpp | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index 3bb3b034..b91e9f68 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -24,18 +24,6 @@ static bool isValidPipeIndex(PipelineType pipe) { return static_cast(pipe) < kPipeStateSize; } -static bool needsPipeAllBarrier(PipelineType srcPipe, PipelineType dstPipe) { - // A3 runtime is unstable for these scalar synchronization forms: - // 1) PIPE_S -> PIPE_MTE2 - // 2) PIPE_MTE3 -> PIPE_S - // Conservatively fall back to PIPE_ALL barrier to preserve correctness. - if (srcPipe == PipelineType::PIPE_S && dstPipe == PipelineType::PIPE_MTE2) - return true; - if (srcPipe == PipelineType::PIPE_MTE3 && dstPipe == PipelineType::PIPE_S) - return true; - return false; -} - // ============================================================================== // 1. Entry Point // ============================================================================== @@ -342,19 +330,7 @@ void InsertSyncAnalysis::InsertSyncOperation( PipelineType nowPipe = nowCompound->kPipeValue; PipelineType frontPipe = frontCompound->kPipeValue; - if (needsPipeAllBarrier(frontPipe, nowPipe)) { - unsigned insertBarrierId = nowCompound->GetIndex(); - auto barrierOp = std::make_unique( - SyncOperation::TYPE::PIPE_BARRIER, PipelineType::PIPE_ALL, - PipelineType::PIPE_ALL, syncIndex_, insertBarrierId, forEndIndex); - barrierOp->SetDepSyncIRIndex(frontCompound->GetIndex()); - syncIR_[insertBarrierId]->pipeBefore.push_back(barrierOp.get()); - barrierOp->SetSyncIRIndex(insertBarrierId); - - SmallVector> newSync; - newSync.emplace_back(std::move(barrierOp)); - syncOperations_.emplace_back(std::move(newSync)); - } else if (nowPipe == frontPipe) { + if (nowPipe == frontPipe) { unsigned insertBarrierId = nowCompound->GetIndex(); auto barrierOp = std::make_unique( SyncOperation::TYPE::PIPE_BARRIER, frontPipe, nowPipe, syncIndex_,