Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/PTO/IR/PTOOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def AddPtrOp : PTO_Op<"addptr", [
//===----------------------------------------------------------------------===//

def LoadScalarOp : PTO_Op<"load_scalar", [
OpPipeInterface,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
]> {
let summary = "Load a single scalar element from a pointer at offset.";
Expand All @@ -116,9 +117,14 @@ def LoadScalarOp : PTO_Op<"load_scalar", [
let assemblyFormat = [{
$ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value)
}];

let extraClassDeclaration = [{
::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; }
}];
}

def StoreScalarOp : PTO_Op<"store_scalar", [
OpPipeInterface,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
]> {
let summary = "Store a single scalar element to a pointer at offset.";
Expand All @@ -136,6 +142,10 @@ def StoreScalarOp : PTO_Op<"store_scalar", [
let assemblyFormat = [{
$value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value)
}];

let extraClassDeclaration = [{
::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; }
}];
}

def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> {
Expand Down
5 changes: 3 additions & 2 deletions lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "PTO/Transforms/InsertSync/InsertSyncAnalysis.h"
#include "PTO/IR/PTO.h"
#include "PTO/Transforms/InsertSync/SyncCommon.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
Expand Down Expand Up @@ -121,9 +122,9 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync(
const PipelineType frontPipe = frontCompound->kPipeValue;
const PipelineType nowPipe = nowCompound->kPipeValue;

if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) {
// Scalar pipe is in-order on target hardware; skip same-pipe sync.
if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S)
return true;
}

if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) {
return true;
Expand Down
76 changes: 61 additions & 15 deletions lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "llvm/Support/FormatVariadic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Matchers.h"
#include <algorithm>
// [P0 addition] Pull in the side-effect interface and the PTO interfaces
#include "mlir/Interfaces/SideEffectInterfaces.h"

Expand All @@ -15,18 +16,41 @@
using namespace mlir;
using namespace mlir::pto;

// Returns the size in bytes of one element of `elemType`.
//
// Integer and float widths are rounded UP to whole bytes so that sub-byte
// and non-byte-multiple types are modeled conservatively (i1/i4 -> 1,
// i17 -> 3). The previous floor division under-reported such widths
// (i17 -> 2), which could shrink computed alias ranges.
// `index` is assumed to be 64-bit (8 bytes) -- TODO confirm for target.
// Unknown element types fall back to 1 byte as a conservative default.
static int64_t getElementSizeInBytes(Type elemType) {
  if (auto intTy = dyn_cast<IntegerType>(elemType)) {
    return std::max<int64_t>(1, (intTy.getWidth() + 7) / 8);
  }
  if (auto floatTy = dyn_cast<FloatType>(elemType)) {
    return std::max<int64_t>(1, (floatTy.getWidth() + 7) / 8);
  }
  if (isa<IndexType>(elemType)) {
    return 8;
  }
  return 1;
}

// [Helper] Try to compute, from `op`, the byte offset relative to `src` and
// the new size of the resulting view/pointer.
// Return value: pair<offsetInBytes, sizeInBytes>.
// Returns {-1, -1} when the offset cannot be computed statically (dynamic),
// and {0, 0} when the op changes nothing or the source type is unsupported.
static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value src) {
auto srcType = dyn_cast<MemRefType>(src.getType());
if (!srcType) return {0, 0};

int64_t elemSize = srcType.getElementType().getIntOrFloatBitWidth() / 8;
if (elemSize == 0) elemSize = 1;
Type srcElemType = nullptr;
if (auto srcType = dyn_cast<MemRefType>(src.getType())) {
srcElemType = srcType.getElementType();
} else if (auto ptrType = dyn_cast<pto::PtrType>(src.getType())) {
srcElemType = ptrType.getElementType();
} else {
return {0, 0};
}

const int64_t elemSize = getElementSizeInBytes(srcElemType);

// === Case 1: memref.subview ===
if (auto subView = dyn_cast<memref::SubViewOp>(op)) {
auto srcType = dyn_cast<MemRefType>(src.getType());
if (!srcType) {
return {-1, -1};
}

int64_t baseOffset;
SmallVector<int64_t, 4> strides;
if (failed(mlir::getStridesAndOffset(srcType, strides, baseOffset))) {
Expand Down Expand Up @@ -71,6 +95,15 @@ static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value s
}
return {staticOffsets[0] * elemSize, 0};
}

// === Case 3: pto.addptr ===
if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) {
llvm::APInt apIntValue;
if (!matchPattern(addPtrOp.getOffset(), m_ConstantInt(&apIntValue))) {
return {-1, -1};
}
return {apIntValue.getSExtValue() * elemSize, 0};
}

return {0, 0};
}
Expand Down Expand Up @@ -138,6 +171,9 @@ void PTOIRTranslator::RecursionIR(Region *region) {
else if (auto bindTileOp = dyn_cast<pto::BindTileOp>(op)) {
UpdateAliasBufferInfo(bindTileOp.getResult(), bindTileOp.getSource());
}
else if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) {
UpdateAliasBufferInfo(addPtrOp.getResult(), addPtrOp.getPtr());
}
else if (auto subViewOp = dyn_cast<pto::PartitionViewOp>(op)) {
UpdateAliasBufferInfo(subViewOp.getResult(), subViewOp.getSource());
}
Expand Down Expand Up @@ -496,28 +532,38 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) {
if (!buffer2MemInfoMap_.contains(source)) return;

int64_t deltaOffset = 0;
int64_t newSize = -1;
int64_t newSize = -1;
bool hasUnknownAliasRange = false;

if (auto op = result.getDefiningOp()) {
auto info = getStaticOffsetAndSize(op, source);
if (info.first != -1) {
deltaOffset = info.first;
if (info.second > 0) newSize = info.second;
}
if (info.first == -1) {
hasUnknownAliasRange = true;
} else {
deltaOffset = info.first;
if (info.second > 0) newSize = info.second;
}
}

auto &resultMemInfoVec = buffer2MemInfoMap_[result];

for (auto &parentInfo : buffer2MemInfoMap_[source]) {
auto newInfo = parentInfo->clone(result);

if (!newInfo->baseAddresses.empty()) {
newInfo->baseAddresses[0] += deltaOffset;

if (hasUnknownAliasRange) {
// Dynamic pointer arithmetic cannot be modeled precisely here.
// Keep root/scope aliasing, but drop concrete range info conservatively.
newInfo->baseAddresses.clear();
newInfo->allocateSize = 0;
} else {
if (!newInfo->baseAddresses.empty()) {
newInfo->baseAddresses[0] += deltaOffset;
} else {
newInfo->baseAddresses.push_back(deltaOffset);
}
}
if (newSize > 0) {

if (!hasUnknownAliasRange && newSize > 0) {
newInfo->allocateSize = newSize;
}

Expand Down
66 changes: 66 additions & 0 deletions test/samples/Sync/test_inject_sync_scalar_cross_pipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType
from mlir.dialects import func, arith, pto


def _idx_const(v: int):
    """Materialize an index-typed arith.constant and return its SSA result."""
    const_op = arith.ConstantOp(IndexType.get(), v)
    return const_op.result


def build():
    """Build the scalar cross-pipe sync regression module.

    Emits a function mixing tile DMA (TLoad/TStore) with pto scalar
    load/store through addptr-offset pointers, so the InsertSync pass must
    connect PIPE_S with the DMA pipes.
    """
    with Context() as ctx:
        pto.register_dialect(ctx, load=True)

        with Location.unknown(ctx):
            module = Module.create()

            # Element and view types used throughout the test function.
            f32 = F32Type.get(ctx)
            ptr_f32 = pto.PtrType.get(f32, ctx)
            tv2 = pto.TensorViewType.get(2, f32, ctx)
            tile_view = pto.PartitionTensorViewType.get([16, 16], f32, ctx)

            # Tile-buffer configuration for the VEC address space.
            vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx)
            bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx)
            sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx)
            pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx)
            cfg = pto.TileBufConfigAttr.get(
                bl, sl, pto.TileConfig.fractalABSize, pd, ctx
            )
            tile_buf = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx)

            fn_type = func.FunctionType.get([ptr_f32, ptr_f32], [])
            with InsertionPoint(module.body):
                fn_op = func.FuncOp("test_scalar_cross_pipe", fn_type)
                entry_block = fn_op.add_entry_block()

                with InsertionPoint(entry_block):
                    src_ptr, dst_ptr = entry_block.arguments
                    c0 = _idx_const(0)
                    c1 = _idx_const(1)
                    c4 = _idx_const(4)
                    c16 = _idx_const(16)

                    # 16x16 row-major views over both pointers.
                    src_view = pto.MakeTensorViewOp(
                        tv2, src_ptr, [c16, c16], [c16, c1]
                    ).result
                    dst_view = pto.MakeTensorViewOp(
                        tv2, dst_ptr, [c16, c16], [c16, c1]
                    ).result
                    src_part = pto.PartitionViewOp(
                        tile_view, src_view, offsets=[c0, c0], sizes=[c16, c16]
                    ).result
                    dst_part = pto.PartitionViewOp(
                        tile_view, dst_view, offsets=[c0, c0], sizes=[c16, c16]
                    ).result
                    tile = pto.AllocTileOp(tile_buf).result

                    # Scalar accesses through offset pointers, interleaved
                    # with the tile DMA to force cross-pipe synchronization.
                    src_off = pto.addptr(src_ptr, c4)
                    dst_off = pto.addptr(dst_ptr, c4)
                    one = arith.ConstantOp(f32, 1.0).result
                    pto.TLoadOp(None, src_part, tile)
                    pto.store_scalar(src_off, c0, one)
                    loaded = pto.load_scalar(f32, dst_off, c0)
                    pto.store_scalar(dst_off, c1, loaded)
                    pto.TStoreOp(None, tile, dst_part)

                    func.ReturnOp([])

            module.operation.verify()
            return module


if __name__ == "__main__":
print(build())
39 changes: 39 additions & 0 deletions test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType
from mlir.dialects import func, arith, pto


def build():
    """Build the scalar intra-pipe barrier regression module.

    Emits two back-to-back pto.store_scalar ops on the same offset pointer:
    a same-pipe (PIPE_S) dependency that should NOT trigger extra sync.
    """
    with Context() as ctx:
        pto.register_dialect(ctx, load=True)

        with Location.unknown(ctx):
            module = Module.create()

            f32 = F32Type.get(ctx)
            idx = IndexType.get(ctx)
            ptr_f32 = pto.PtrType.get(f32, ctx)

            fn_type = func.FunctionType.get([ptr_f32], [])
            with InsertionPoint(module.body):
                fn_op = func.FuncOp("test_scalar_intra_pipe_barrier", fn_type)
                entry_block = fn_op.add_entry_block()

                with InsertionPoint(entry_block):
                    base_ptr = entry_block.arguments[0]
                    c0 = arith.ConstantOp(idx, 0).result
                    c4 = arith.ConstantOp(idx, 4).result
                    one = arith.ConstantOp(f32, 1.0).result
                    two = arith.ConstantOp(f32, 2.0).result

                    # Two stores to the same location on PIPE_S: WAW within
                    # one in-order pipe, so no event sync is expected.
                    shifted_ptr = pto.addptr(base_ptr, c4)
                    pto.store_scalar(shifted_ptr, c0, one)
                    pto.store_scalar(shifted_ptr, c0, two)

                    func.ReturnOp([])

            module.operation.verify()
            return module


if __name__ == "__main__":
print(build())
60 changes: 60 additions & 0 deletions test/samples/runop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,66 @@ process_one_dir() {
fi
fi

# Scalar sync regression: scalar load/store should participate in PIPE_S
# auto-sync and correctly connect with supported DMA directions.
if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then
if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_S"
overall=1
continue
fi
if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_S"
overall=1
continue
fi
if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE3"
overall=1
continue
fi
if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE3"
overall=1
continue
fi
if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \
grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S->PIPE_MTE2 event in scalar cross-pipe case"
overall=1
continue
fi
if grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \
grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE3->PIPE_S event in scalar cross-pipe case"
overall=1
continue
fi
fi

# Scalar intra-pipe regression: PIPE_S local dependency should not inject
# extra sync (PIPE_S is in-order); only function-tail PIPE_ALL remains.
if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then
local bar_all_cnt
bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)"
if grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tunexpected pipe_barrier(PIPE_S) for scalar intra-pipe dependency"
overall=1
continue
fi
if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \
grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then
echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_S event sync for scalar intra-pipe dependency"
overall=1
continue
fi
if [[ "${bar_all_cnt}" -ne 1 ]]; then
echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_ALL barrier count=${bar_all_cnt} (expect 1 tail barrier)"
overall=1
continue
fi
fi

# Regression guard for issue #185: barrier_sync must support op types
# beyond TMATMUL/TVEC and lower to the expected per-pipe barrier.
if [[ "$base" == "test_barrier_sync" ]]; then
Expand Down