diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index c967a75e..28b1c85d 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -100,6 +100,7 @@ def AddPtrOp : PTO_Op<"addptr", [ //===----------------------------------------------------------------------===// def LoadScalarOp : PTO_Op<"load_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods<MemoryEffectsOpInterface> ]> { let summary = "Load a single scalar element from a pointer at offset."; @@ -116,9 +117,14 @@ def LoadScalarOp : PTO_Op<"load_scalar", [ let assemblyFormat = [{ $ptr `[` $offset `]` attr-dict `:` type($ptr) `->` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def StoreScalarOp : PTO_Op<"store_scalar", [ + OpPipeInterface, DeclareOpInterfaceMethods<MemoryEffectsOpInterface> ]> { let summary = "Store a single scalar element to a pointer at offset."; @@ -136,6 +142,10 @@ def StoreScalarOp : PTO_Op<"store_scalar", [ let assemblyFormat = [{ $value `,` $ptr `[` $offset `]` attr-dict `:` type($ptr) `,` type($value) }]; + + let extraClassDeclaration = [{ + ::mlir::pto::PIPE getPipe() { return ::mlir::pto::PIPE::PIPE_S; } + }]; } def MakeTensorViewOp : PTO_Op<"make_tensor_view", [AttrSizedOperandSegments]> { diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index a33bf889..b91e9f68 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -1,4 +1,5 @@ #include "PTO/Transforms/InsertSync/InsertSyncAnalysis.h" +#include "PTO/IR/PTO.h" #include "PTO/Transforms/InsertSync/SyncCommon.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -121,9 +122,9 @@ bool InsertSyncAnalysis::IsNoNeedToInsertSync( const PipelineType frontPipe = frontCompound->kPipeValue; const PipelineType nowPipe = nowCompound->kPipeValue; - if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) { + 
// Scalar pipe is in-order on target hardware; skip same-pipe sync. + if (frontPipe == nowPipe && frontPipe == PipelineType::PIPE_S) return true; - } if (nowCompound->elementOp == frontCompound->elementOp && !isBackwardDep) { return true; diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp index 33aec28b..b5e3b463 100644 --- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp +++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp @@ -7,6 +7,7 @@ #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Matchers.h" +#include <algorithm> // [P0 新增] 引入副作用接口和 PTO 接口 #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -15,18 +16,41 @@ using namespace mlir; using namespace mlir::pto; +static int64_t getElementSizeInBytes(Type elemType) { + if (auto intTy = dyn_cast<IntegerType>(elemType)) { + return std::max<int64_t>(1, intTy.getWidth() / 8); + } + if (auto floatTy = dyn_cast<FloatType>(elemType)) { + return std::max<int64_t>(1, floatTy.getWidth() / 8); + } + if (isa<IndexType>(elemType)) { + return 8; + } + return 1; +} + // [辅助函数] 尝试从 Operation 中计算相对于 Source 的字节偏移量和新大小 // 返回值: pair<offsetBytes, newSize> // 如果无法计算静态值,返回 {-1, -1} 表示这是动态的 static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value src) { - auto srcType = dyn_cast<MemRefType>(src.getType()); - if (!srcType) return {0, 0}; - - int64_t elemSize = srcType.getElementType().getIntOrFloatBitWidth() / 8; - if (elemSize == 0) elemSize = 1; + Type srcElemType = nullptr; + if (auto srcType = dyn_cast<MemRefType>(src.getType())) { + srcElemType = srcType.getElementType(); + } else if (auto ptrType = dyn_cast<pto::PtrType>(src.getType())) { + srcElemType = ptrType.getElementType(); + } else { + return {0, 0}; + } + + const int64_t elemSize = getElementSizeInBytes(srcElemType); // === Case 1: memref.subview === if (auto subView = dyn_cast<memref::SubViewOp>(op)) { + auto srcType = dyn_cast<MemRefType>(src.getType()); + if (!srcType) { + return {-1, -1}; + } + int64_t baseOffset; SmallVector<int64_t> strides; if (failed(mlir::getStridesAndOffset(srcType, strides, 
baseOffset))) { @@ -71,6 +95,15 @@ static std::pair<int64_t, int64_t> getStaticOffsetAndSize(Operation *op, Value s } return {staticOffsets[0] * elemSize, 0}; } + + // === Case 3: pto.addptr === + if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) { + llvm::APInt apIntValue; + if (!matchPattern(addPtrOp.getOffset(), m_ConstantInt(&apIntValue))) { + return {-1, -1}; + } + return {apIntValue.getSExtValue() * elemSize, 0}; + } return {0, 0}; } @@ -138,6 +171,9 @@ void PTOIRTranslator::RecursionIR(Region *region) { else if (auto bindTileOp = dyn_cast<pto::BindTileOp>(op)) { UpdateAliasBufferInfo(bindTileOp.getResult(), bindTileOp.getSource()); } + else if (auto addPtrOp = dyn_cast<pto::AddPtrOp>(op)) { + UpdateAliasBufferInfo(addPtrOp.getResult(), addPtrOp.getPtr()); + } else if (auto subViewOp = dyn_cast<memref::SubViewOp>(op)) { UpdateAliasBufferInfo(subViewOp.getResult(), subViewOp.getSource()); } @@ -496,28 +532,38 @@ void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) { if (!buffer2MemInfoMap_.contains(source)) return; int64_t deltaOffset = 0; - int64_t newSize = -1; + int64_t newSize = -1; + bool hasUnknownAliasRange = false; if (auto op = result.getDefiningOp()) { auto info = getStaticOffsetAndSize(op, source); - if (info.first != -1) { - deltaOffset = info.first; - if (info.second > 0) newSize = info.second; - } + if (info.first == -1) { + hasUnknownAliasRange = true; + } else { + deltaOffset = info.first; + if (info.second > 0) newSize = info.second; + } } auto &resultMemInfoVec = buffer2MemInfoMap_[result]; for (auto &parentInfo : buffer2MemInfoMap_[source]) { auto newInfo = parentInfo->clone(result); - - if (!newInfo->baseAddresses.empty()) { - newInfo->baseAddresses[0] += deltaOffset; + + if (hasUnknownAliasRange) { + // Dynamic pointer arithmetic cannot be modeled precisely here. + // Keep root/scope aliasing, but drop concrete range info conservatively. 
+ newInfo->baseAddresses.clear(); + newInfo->allocateSize = 0; } else { + if (!newInfo->baseAddresses.empty()) { + newInfo->baseAddresses[0] += deltaOffset; + } else { newInfo->baseAddresses.push_back(deltaOffset); + } } - - if (newSize > 0) { + + if (!hasUnknownAliasRange && newSize > 0) { newInfo->allocateSize = newSize; } diff --git a/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py new file mode 100644 index 00000000..47310dc3 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_cross_pipe.py @@ -0,0 +1,66 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def _idx_const(v: int): + return arith.ConstantOp(IndexType.get(), v).result + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + tv2 = pto.TensorViewType.get(2, f32, ctx) + tile_view = pto.PartitionTensorViewType.get([16, 16], f32, ctx) + + vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + tile_buf = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_cross_pipe", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + src, dst = entry.arguments + c0 = _idx_const(0) + c1 = _idx_const(1) + c4 = _idx_const(4) + c16 = _idx_const(16) + + src_tv = pto.MakeTensorViewOp(tv2, src, [c16, c16], [c16, c1]).result + dst_tv = pto.MakeTensorViewOp(tv2, dst, [c16, c16], [c16, c1]).result + src_part = pto.PartitionViewOp( + tile_view, 
src_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + dst_part = pto.PartitionViewOp( + tile_view, dst_tv, offsets=[c0, c0], sizes=[c16, c16] + ).result + ub = pto.AllocTileOp(tile_buf).result + + src_off = pto.addptr(src, c4) + dst_off = pto.addptr(dst, c4) + one = arith.ConstantOp(f32, 1.0).result + pto.TLoadOp(None, src_part, ub) + pto.store_scalar(src_off, c0, one) + loaded = pto.load_scalar(f32, dst_off, c0) + pto.store_scalar(dst_off, c1, loaded) + pto.TStoreOp(None, ub, dst_part) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py new file mode 100644 index 00000000..6c61ae26 --- /dev/null +++ b/test/samples/Sync/test_inject_sync_scalar_intra_pipe_barrier.py @@ -0,0 +1,39 @@ +from mlir.ir import Context, Location, Module, InsertionPoint, F32Type, IndexType +from mlir.dialects import func, arith, pto + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f32 = F32Type.get(ctx) + idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("test_scalar_intra_pipe_barrier", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + ptr = entry.arguments[0] + c0 = arith.ConstantOp(idx, 0).result + c4 = arith.ConstantOp(idx, 4).result + one = arith.ConstantOp(f32, 1.0).result + two = arith.ConstantOp(f32, 2.0).result + + ptr_off = pto.addptr(ptr, c4) + pto.store_scalar(ptr_off, c0, one) + pto.store_scalar(ptr_off, c0, two) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ef71560d..933bfb2f 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh 
@@ -292,6 +292,66 @@ process_one_dir() { fi fi + # Scalar sync regression: scalar load/store should participate in PIPE_S + # auto-sync and correctly connect with supported DMA directions. + if [[ "$base" == "test_inject_sync_scalar_cross_pipe" ]]; then + if ! grep -Eq "set_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_MTE2->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_MTE2,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_MTE2->PIPE_S" + overall=1 + continue + fi + if ! grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing set_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if ! grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE3,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing wait_flag PIPE_S->PIPE_MTE3" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_MTE2,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S->PIPE_MTE2 event in scalar cross-pipe case" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_MTE3,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_MTE3->PIPE_S event in scalar cross-pipe case" + overall=1 + continue + fi + fi + + # Scalar intra-pipe regression: PIPE_S local dependency should not inject + # extra sync (PIPE_S is in-order); only function-tail PIPE_ALL remains. 
+ if [[ "$base" == "test_inject_sync_scalar_intra_pipe_barrier" ]]; then + local bar_all_cnt + bar_all_cnt="$(grep -Fc "pipe_barrier(PIPE_ALL)" "$cpp" || true)" + if grep -Fq "pipe_barrier(PIPE_S)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected pipe_barrier(PIPE_S) for scalar intra-pipe dependency" + overall=1 + continue + fi + if grep -Eq "set_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp" || \ + grep -Eq "wait_flag\\(PIPE_S,[[:space:]]*PIPE_S,[[:space:]]*EVENT_ID[0-7]\\)" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_S<->PIPE_S event sync for scalar intra-pipe dependency" + overall=1 + continue + fi + if [[ "${bar_all_cnt}" -ne 1 ]]; then + echo -e "${A}(${base}.py)\tFAIL\tunexpected PIPE_ALL barrier count=${bar_all_cnt} (expect 1 tail barrier)" + overall=1 + continue + fi + fi + # Regression guard for issue #185: barrier_sync must support op types # beyond TMATMUL/TVEC and lower to the expected per-pipe barrier. if [[ "$base" == "test_barrier_sync" ]]; then