From 144b9b71b3f8c3ea2f65f1ee0de6740237218c5f Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Sat, 14 Mar 2026 20:35:11 +0800 Subject: [PATCH 1/4] feat: add tassemble lowering via TINSERT --- docs/PTO_IR_manual.md | 38 +++++++++++++++++ include/PTO/IR/PTOOps.td | 44 +++++++++++++++++++ lib/PTO/IR/PTO.cpp | 70 +++++++++++++++++++++++++++++++ lib/PTO/Transforms/PTOToEmitC.cpp | 27 +++++++++++- test/samples/Assemble/assemble.py | 45 ++++++++++++++++++++ test/samples/runop.sh | 8 ++++ 6 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 test/samples/Assemble/assemble.py diff --git a/docs/PTO_IR_manual.md b/docs/PTO_IR_manual.md index 03a6667d..8c081be7 100644 --- a/docs/PTO_IR_manual.md +++ b/docs/PTO_IR_manual.md @@ -4517,6 +4517,44 @@ pto.treshape ins(%src : !pto.tile_buf<...>) outs(%dst : !pto.tile_buf<...>) --- +##### `pto.tassemble` - Insert Sub-Tile Window + +**Summary:** Inserts a source tile into a destination tile at a given row/col offset. + +**Semantics:** + +``` +dst[i + indexRow, j + indexCol] = src[i, j] +``` + +**Arguments:** + +| Name | Type | Description | +|------|------|-------------| +| `src` | `pto.tile_buf` | Source tile | +| `indexRow` | `Index` | Destination row offset | +| `indexCol` | `Index` | Destination column offset | +| `dst` | `pto.tile_buf` | Destination tile | + +**Results:** None. Writes into `dst` via DPS pattern. + +**Constraints & Verification:** + +- The operation has a custom verifier + +**Hardware Mapping:** + +- Lowers to **`TINSERT(dst, src, indexRow, indexCol)`** +- Uses target data-movement pipeline (MTE1 by default; A5 UB->L1 path uses MTE3) + +**Basic Example:** + +```mlir +pto.tassemble ins(%src, %row, %col : !pto.tile_buf<...>, index, index) outs(%dst : !pto.tile_buf<...>) +``` + +--- + ##### `pto.textract` - Extract Sub-Tile Window **Summary:** Extracts a sub-tile window from a source tile into a destination tile. diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 267297e2..0b181207 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -2027,6 +2027,50 @@ def TExtractOp : PTO_TOp<"textract", [ }]; } +def TAssembleOp : PTO_TOp<"tassemble", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Insert src sub-tile into dst at (indexRow, indexCol) (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + Index:$indexRow, + Index:$indexCol, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $indexRow `,` $indexCol `:` qualified(type($src)) `,` type($indexRow) `,` type($indexCol) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + // TINSERT runs on different DMA pipes across targets. + // - A5 (Ascend950/910_95): UB->L1 path is MTE3 in pto-isa custom kernels. + // - Others: keep MTE1 for compatibility with existing data-movement sync. + ::mlir::pto::PIPE getPipe() { + auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>(); + if (moduleOp) { + if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) { + auto s = spec.getValue(); + if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) { + return ::mlir::pto::PIPE::PIPE_MTE3; + } + } + } + return ::mlir::pto::PIPE::PIPE_MTE1; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + def TFillPadOp : PTO_TOp<"tfillpad", [ PTO_DpsInitOpInterface, OpPipeInterface, diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 02670518..36a240c4 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -1528,6 +1528,69 @@ mlir::LogicalResult mlir::pto::TExtractOp::verify() { return mlir::success(); } //===----------------------------------------------------------------------===// +// TAssembleOp_DPS verifier +//===----------------------------------------------------------------------===// + +mlir::LogicalResult mlir::pto::TAssembleOp::verify() { + Type srcTy = getSrc().getType(); + Type dstTy = getDst().getType(); + if (!isPTOShapedLike(srcTy) || !isPTOShapedLike(dstTy)) + return emitOpError("expects src/dst to be PTO shaped-like types"); + + auto srcShape = getShapeVec(srcTy); + auto dstShape = getShapeVec(dstTy); + if (srcShape.size() != 2 || dstShape.size() != 2) + return emitOpError("expects rank-2 shaped types for src/dst"); + + if (getElemTy(srcTy) != getElemTy(dstTy)) + return emitOpError("expects src element type == dst element type"); + + if (!getIndexRow().getType().isIndex() || !getIndexCol().getType().isIndex()) + return emitOpError("expects indexRow/indexCol to be index type"); + + auto readConstIndex = [&](Value v, int64_t &out) -> bool { + if (auto cOp = v.getDefiningOp()) { + out = cOp.value(); + return true; + } + if (auto cInt = v.getDefiningOp()) { + out = cInt.value(); + return true; + } + if (auto cOp = v.getDefiningOp()) { + if (auto ia = mlir::dyn_cast(cOp.getValue())) { + out = ia.getInt(); + return true; + } + } + return false; + }; + + int64_t r0 = 0; + int64_t c0 = 0; + bool rowConst = readConstIndex(getIndexRow(), r0); + bool colConst = readConstIndex(getIndexCol(), c0); + if (rowConst && r0 < 0) + return emitOpError("indexRow must be non-negative"); + if (colConst && c0 < 0) + return emitOpError("indexCol must be non-negative"); + + int64_t srcRows = srcShape[0]; + int64_t srcCols = srcShape[1]; + int64_t dstRows = dstShape[0]; + int64_t dstCols = dstShape[1]; + if (rowConst && srcRows != mlir::ShapedType::kDynamic && + dstRows != mlir::ShapedType::kDynamic && + r0 + srcRows > dstRows) + return emitOpError("indexRow + src rows exceeds dst rows"); + if (colConst && srcCols != mlir::ShapedType::kDynamic && + dstCols != mlir::ShapedType::kDynamic && + c0 + srcCols > dstCols) + return emitOpError("indexCol + src cols exceeds dst cols"); + + return mlir::success(); +} +//===----------------------------------------------------------------------===// // TFillPadOp_DPS verifier //===----------------------------------------------------------------------===// @@ -4282,6 +4345,13 @@ void TExtractOp::getEffects( PTO_ADD_WRITE(getDstMutable()); } +// TASSEMBLE: Read(src) -> Write(dst) +void TAssembleOp::getEffects( + SmallVectorImpl> &effects) { + PTO_ADD_READ(getSrcMutable()); + PTO_ADD_WRITE(getDstMutable()); +} + PTO_DEFINE_UNARY_EFFECTS(TFillPadOp, getSrcMutable(), getDstMutable()) PTO_DEFINE_UNARY_EFFECTS(TFillPadExpandOp, getSrcMutable(), getDstMutable()) diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 939287dd..5a95250a 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -4743,6 +4743,31 @@ struct PTOExtractToEmitC : public OpConversionPattern { } }; //===----------------------------------------------------------------------===// +// pto.tassemble lowering -> TINSERT(dst, src, indexRow, indexCol) +//===----------------------------------------------------------------------===// + +struct PTOAssembleToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAssembleOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value r0 = peelUnrealized(adaptor.getIndexRow()); + Value c0 = peelUnrealized(adaptor.getIndexCol()); + + rewriter.create( + loc, TypeRange{}, "TINSERT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, r0, c0}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// // pto.tfillpad lowering -> TFILLPAD(dst, src) //===----------------------------------------------------------------------===// @@ -7298,7 +7323,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); diff --git a/test/samples/Assemble/assemble.py b/test/samples/Assemble/assemble.py new file mode 100644 index 00000000..d3795199 --- /dev/null +++ b/test/samples/Assemble/assemble.py @@ -0,0 +1,45 @@ +from mlir.ir import Context, Location, Module, InsertionPoint +from mlir.dialects import func, arith, pto +from mlir.ir import F16Type, IndexType + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f16 = F16Type.get(ctx) + idx = IndexType.get(ctx) + + vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) + mat = pto.AddressSpaceAttr.get(pto.AddressSpace.MAT, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + + src_ty = pto.TileBufType.get([16, 16], f16, vec, [16, 16], cfg, ctx) + dst_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg, ctx) + + fn_ty = func.FunctionType.get([], []) + with InsertionPoint(m.body): + fn = func.FuncOp("assemble_demo", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + c16 = arith.ConstantOp(idx, 16).result + + src = pto.AllocTileOp(src_ty).result + dst = pto.AllocTileOp(dst_ty).result + pto.TAssembleOp(src, c16, c16, dst) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ee50e931..335d760a 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -463,6 +463,14 @@ PY fi fi + if [[ "$base" == "assemble" ]]; then + if ! grep -Fq "TINSERT(" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tmissing TINSERT() lowering for pto.tassemble" + overall=1 + continue + fi + fi + if [[ "$base" == "fillpad" ]]; then if ! grep -Fq "TFILLPAD(" "$cpp"; then echo -e "${A}(${base}.py)\tFAIL\tmissing TFILLPAD() lowering for pto.tfillpad" From 0f489e94f67d9f4ec260dea2249ae657e845396e Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Sat, 14 Mar 2026 20:56:29 +0800 Subject: [PATCH 2/4] test: add tassemble golden and A3 board validation harness --- test/samples/Assemble/assemble.golden | 4 + test/samples/Assemble/assemble.py | 42 +++-- .../Assemble/board_validation/CMakeLists.txt | 129 +++++++++++++ .../Assemble/board_validation/README.md | 25 +++ .../board_validation/assemble_kernel.cpp | 93 ++++++++++ .../Assemble/board_validation/compare.py | 169 ++++++++++++++++++ .../Assemble/board_validation/golden.py | 29 +++ .../Assemble/board_validation/launch.cpp | 65 +++++++ .../Assemble/board_validation/main.cpp | 126 +++++++++++++ .../Assemble/board_validation/outputs.txt | 1 + test/samples/Assemble/board_validation/run.sh | 53 ++++++ test/samples/runop.sh | 19 +- 12 files changed, 743 insertions(+), 12 deletions(-) create mode 100644 test/samples/Assemble/assemble.golden create mode 100644 test/samples/Assemble/board_validation/CMakeLists.txt create mode 100644 test/samples/Assemble/board_validation/README.md create mode 100644 test/samples/Assemble/board_validation/assemble_kernel.cpp create mode 100644 test/samples/Assemble/board_validation/compare.py create mode 100644 test/samples/Assemble/board_validation/golden.py create mode 100644 test/samples/Assemble/board_validation/launch.cpp create mode 100644 test/samples/Assemble/board_validation/main.cpp create mode 100644 test/samples/Assemble/board_validation/outputs.txt create mode 100755 test/samples/Assemble/board_validation/run.sh diff --git a/test/samples/Assemble/assemble.golden b/test/samples/Assemble/assemble.golden new file mode 100644 index 00000000..46a9385c --- /dev/null +++ b/test/samples/Assemble/assemble.golden @@ -0,0 +1,4 @@ +# Regex patterns that must appear in emitted C++ for assemble.py +TLOAD\( +TINSERT\( +TSTORE\( diff --git a/test/samples/Assemble/assemble.py b/test/samples/Assemble/assemble.py index d3795199..d6be9fc8 100644 --- a/test/samples/Assemble/assemble.py +++ b/test/samples/Assemble/assemble.py @@ -1,6 +1,6 @@ from mlir.ir import Context, Location, Module, InsertionPoint from mlir.dialects import func, arith, pto -from mlir.ir import F16Type, IndexType +from mlir.ir import F32Type, IndexType def build(): @@ -10,30 +10,52 @@ def build(): with Location.unknown(ctx): m = Module.create() - f16 = F16Type.get(ctx) + f32 = F32Type.get(ctx) idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + + tv2_f32 = pto.TensorViewType.get(2, f32, ctx) + tile_view_16 = pto.PartitionTensorViewType.get([16, 16], f32, ctx) + tile_view_32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) - mat = pto.AddressSpaceAttr.get(pto.AddressSpace.MAT, ctx) - bl = pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx) - sl = pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx) + bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) + sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) - src_ty = pto.TileBufType.get([16, 16], f16, vec, [16, 16], cfg, ctx) - dst_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg, ctx) + src_ty = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) + dst_ty = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) - fn_ty = func.FunctionType.get([], []) + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32], []) with InsertionPoint(m.body): - fn = func.FuncOp("assemble_demo", fn_ty) + fn = func.FuncOp("assemble_kernel", fn_ty) entry = fn.add_entry_block() with InsertionPoint(entry): + c0 = arith.ConstantOp(idx, 0).result + c1 = arith.ConstantOp(idx, 1).result + c8 = arith.ConstantOp(idx, 8).result c16 = arith.ConstantOp(idx, 16).result + c32 = arith.ConstantOp(idx, 32).result + + arg_src, arg_dst, arg_out = entry.arguments + + tv_src = pto.MakeTensorViewOp(tv2_f32, arg_src, [c16, c16], [c16, c1]).result + tv_dst = pto.MakeTensorViewOp(tv2_f32, arg_dst, [c32, c32], [c32, c1]).result + tv_out = pto.MakeTensorViewOp(tv2_f32, arg_out, [c32, c32], [c32, c1]).result + + sv_src = pto.PartitionViewOp(tile_view_16, tv_src, offsets=[c0, c0], sizes=[c16, c16]).result + sv_dst = pto.PartitionViewOp(tile_view_32, tv_dst, offsets=[c0, c0], sizes=[c32, c32]).result + sv_out = pto.PartitionViewOp(tile_view_32, tv_out, offsets=[c0, c0], sizes=[c32, c32]).result src = pto.AllocTileOp(src_ty).result dst = pto.AllocTileOp(dst_ty).result - pto.TAssembleOp(src, c16, c16, dst) + + pto.TLoadOp(None, sv_src, src) + pto.TLoadOp(None, sv_dst, dst) + pto.TAssembleOp(src, c8, c8, dst) + pto.TStoreOp(None, dst, sv_out) func.ReturnOp([]) diff --git a/test/samples/Assemble/board_validation/CMakeLists.txt b/test/samples/Assemble/board_validation/CMakeLists.txt new file mode 100644 index 00000000..88e1dd26 --- /dev/null +++ b/test/samples/Assemble/board_validation/CMakeLists.txt @@ -0,0 +1,129 @@ +cmake_minimum_required(VERSION 3.16) + +# Prefer setting compilers before project() so CMake picks up bisheng correctly. +set(CMAKE_C_COMPILER bisheng) +set(CMAKE_CXX_COMPILER bisheng) + +project(assemble_npu_validation) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +if(NOT DEFINED SOC_VERSION) + set(SOC_VERSION Ascend910) +endif() +option(ENABLE_SIM_GOLDEN "Build Ascend simulator (camodel) executable" ON) + +if(NOT DEFINED ENV{ASCEND_HOME_PATH}) + message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please source the CANN set_env.sh.") +else() + set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH}) +endif() + +set(PTO_ISA_ROOT "" CACHE PATH "Path to pto-isa repo") +if(NOT PTO_ISA_ROOT) + set(_PTO_ISA_CANDIDATES + "${CMAKE_CURRENT_LIST_DIR}/../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../../pto-isa" + ) + foreach(_cand IN LISTS _PTO_ISA_CANDIDATES) + if(EXISTS "${_cand}/include" AND EXISTS "${_cand}/tests/common") + set(PTO_ISA_ROOT "${_cand}" CACHE PATH "Path to pto-isa repo" FORCE) + break() + endif() + endforeach() +endif() +if(NOT PTO_ISA_ROOT) + message(FATAL_ERROR "Cannot find PTO_ISA_ROOT, please pass -DPTO_ISA_ROOT=/path/to/pto-isa.") +endif() + +set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver) + +add_compile_options( + -D_FORTIFY_SOURCE=2 + -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes + -fstack-protector-strong + -fPIC +) +add_link_options( + -s + -Wl,-z,relro + -Wl,-z,now +) + + set(CMAKE_CCE_COMPILE_OPTIONS + -xcce + -fenable-matrix + --cce-aicore-enable-tl + + + -fPIC + -Xhost-start -Xhost-end + "SHELL:-mllvm -cce-aicore-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-record-overflow=true" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" +) + +set(CMAKE_CPP_COMPILE_OPTIONS + -xc++ + "SHELL:-include stdint.h" + "SHELL:-include stddef.h" +) + +include_directories( + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common + ${ASCEND_HOME_PATH}/include + ${ASCEND_DRIVER_PATH}/kernel/inc +) + + add_library(assemble_kernel SHARED assemble_kernel.cpp launch.cpp) + target_compile_options(assemble_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=dav-c220-vec -DMEMORY_BASE -std=c++17) + target_include_directories(assemble_kernel PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/ + ${ASCEND_HOME_PATH}/pkg_inc/profiling/ + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) +target_link_options(assemble_kernel PRIVATE --cce-fatobj-link) + +add_executable(assemble main.cpp) +target_compile_options(assemble PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) +target_include_directories(assemble PRIVATE + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common +) + +target_link_directories(assemble PUBLIC + ${ASCEND_HOME_PATH}/lib64 +) + +target_link_libraries(assemble PRIVATE + assemble_kernel + runtime + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase +) + +if(ENABLE_SIM_GOLDEN) + # Simulator executable: used to generate golden outputs (Ascend camodel). + add_executable(assemble_sim main.cpp) + target_compile_options(assemble_sim PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) + target_include_directories(assemble_sim PRIVATE + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common + ) + target_link_directories(assemble_sim PUBLIC + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SOC_VERSION}/lib + ${ASCEND_HOME_PATH}/simulator/${SOC_VERSION}/lib + ${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib + ) + target_link_libraries(assemble_sim PRIVATE + assemble_kernel + runtime_camodel + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase + ) +endif() diff --git a/test/samples/Assemble/board_validation/README.md b/test/samples/Assemble/board_validation/README.md new file mode 100644 index 00000000..d82824eb --- /dev/null +++ b/test/samples/Assemble/board_validation/README.md @@ -0,0 +1,25 @@ +# Assemble NPU Validation (A3) + +This test validates `pto.tassemble` functional correctness on board. + +Semantics under test: + +`dst[i + 8, j + 8] = src[i, j]` for `0 <= i,j < 16`. + +Inputs/outputs: + +- `v1.bin`: source tile (`16x16`, `float32`) +- `v2.bin`: destination tile (`32x32`, `float32`) +- `v3.bin`: output tile (`32x32`, `float32`) +- `golden_v3.bin`: expected output generated by CPU golden + +Run on A3: + +```bash +cd test/samples/Assemble/npu_validation +SOC_VERSION=Ascend910 ./run.sh +``` + +A passing run prints: + +`[INFO] compare passed` diff --git a/test/samples/Assemble/board_validation/assemble_kernel.cpp b/test/samples/Assemble/board_validation/assemble_kernel.cpp new file mode 100644 index 00000000..c50cf58e --- /dev/null +++ b/test/samples/Assemble/board_validation/assemble_kernel.cpp @@ -0,0 +1,93 @@ +// --------------------------------------------------------------------------- +// PTOAS compatibility layer +// +// The upstream pto-isa headers reference some FP8/FP4 types and the +// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain +// combination (e.g. __NPU_ARCH__==2201). +// +// For our PTOAS-generated kernels we don't rely on these types today, but the +// headers still mention them in templates/static_asserts. Provide minimal +// fallbacks to keep compilation working on dav-c220. +// --------------------------------------------------------------------------- +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif +#include + +// AICore printf support is gated behind `--cce-enable-print` on some +// toolchains. When enabled, include the CCE print header so `cce::printf` +// resolves in device compilation. +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation +// path, but `bisheng -xcce` still performs a host-side parse pass. +// Provide minimal fallbacks only when the corresponding header wasn't +// pulled in by the selected arch implementation. +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3) { + unsigned v4 = 1024; + unsigned v5 = 32; + unsigned v6 = 256; + unsigned v7 = 16; + unsigned v8 = 1; + unsigned v9 = 0; + int32_t v10 = 32; + int32_t v11 = 16; + int32_t v12 = 8; + int32_t v13 = 1; + int64_t v14 = 0; + int64_t v15 = 1024; + using T = float; + pto::Shape<1, 1, 1, 16, 16> v16 = pto::Shape<1, 1, 1, 16, 16>(); + pto::Stride<256, 256, 256, 16, 1> v17 = pto::Stride<256, 256, 256, 16, 1>(); + GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>(v1 + (v9 + v9 * (unsigned) v11 + v9 * (unsigned) v13), v16, v17); + pto::Shape<1, 1, 1, 32, 32> v19 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v20 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v2 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v13), v19, v20); + pto::Shape<1, 1, 1, 32, 32> v22 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v23 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v24 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v3 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v13), v22, v23); + Tile v25; + TASSIGN(v25, v14); + Tile v26; + TASSIGN(v26, v15); + TLOAD(v25, v18); + TLOAD(v26, v21); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TINSERT(v26, v25, v12, v12); + set_flag(PIPE_MTE1, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE3, EVENT_ID0); + TSTORE(v24, v26); + pipe_barrier(PIPE_ALL); + return; +} + diff --git a/test/samples/Assemble/board_validation/compare.py b/test/samples/Assemble/board_validation/compare.py new file mode 100644 index 00000000..cb360ad7 --- /dev/null +++ b/test/samples/Assemble/board_validation/compare.py @@ -0,0 +1,169 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import sys +import numpy as np + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + g = golden.astype(np.int64, copy=False) + o = output.astype(np.int64, copy=False) + else: + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_bin_prefix(golden_path, output_path, dtype, eps, count): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + try: + count = int(count) + except Exception: + print(f"[ERROR] Invalid prefix count: {count}") + return False + if count <= 0: + print(f"[ERROR] Invalid prefix count: {count}") + return False + + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np, count=count) + output = np.fromfile(output_path, dtype=dtype_np, count=count) + + if golden.size != count or output.size != count: + print( + f"[ERROR] Prefix read too small: need={count} elems, " + f"golden={golden.size}, out={output.size}" + ) + return False + + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + g = golden.astype(np.int64, copy=False) + o = output.astype(np.int64, copy=False) + else: + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" + ) + else: + print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + """ + Compare outputs of pto.tcmp / pto.tcmps. + + These ops produce a *packed predicate mask* and do not define every byte in + the logical u8 tile buffer. In practice, only the first N bytes of each row + are meaningful (packed as 64-bit chunks). Ignore the rest to avoid flaky + compares caused by undefined bytes. + """ + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + try: + rows = int(rows) + cols = int(cols) + except Exception: + print(f"[ERROR] Invalid rows/cols for packed mask compare: rows={rows} cols={cols}") + return False + if rows <= 0 or cols <= 0: + print(f"[ERROR] Invalid rows/cols for packed mask compare: rows={rows} cols={cols}") + return False + + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + + need = rows * cols + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + + # Packed mask layout: 1 predicate bit per element, packed into 64-bit words + # per row (so 8 bytes per 64 columns). For cols <= 64 we still use one word. + row_bytes = ((cols + 63) // 64) * 8 + row_bytes = min(row_bytes, cols) + + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def main(): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + ok = True + ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return + print("[INFO] compare passed") + + +if __name__ == "__main__": + main() diff --git a/test/samples/Assemble/board_validation/golden.py b/test/samples/Assemble/board_validation/golden.py new file mode 100644 index 00000000..22a1e9fc --- /dev/null +++ b/test/samples/Assemble/board_validation/golden.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import numpy as np + + +def main(): + np.random.seed(19) + + # Inputs + src = np.random.random(size=(16, 16)).astype(np.float32) + dst = np.random.random(size=(32, 32)).astype(np.float32) + + # Kernel reads an initial output buffer; keep deterministic content. + out_init = np.zeros((32, 32), dtype=np.float32) + + # Golden for TAssemble semantics: + # dst[i + 8, j + 8] = src[i, j] + golden = dst.copy() + golden[8:24, 8:24] = src + + src.tofile("v1.bin") + dst.tofile("v2.bin") + out_init.tofile("v3.bin") + golden.tofile("golden_v3.bin") + + +if __name__ == "__main__": + main() diff --git a/test/samples/Assemble/board_validation/launch.cpp b/test/samples/Assemble/board_validation/launch.cpp new file mode 100644 index 00000000..24196cc0 --- /dev/null +++ b/test/samples/Assemble/board_validation/launch.cpp @@ -0,0 +1,65 @@ +// --------------------------------------------------------------------------- +// PTOAS compatibility layer +// +// The upstream pto-isa headers reference some FP8/FP4 types and the +// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain +// combination (e.g. __NPU_ARCH__==2201). +// +// For our PTOAS-generated kernels we don't rely on these types today, but the +// headers still mention them in templates/static_asserts. Provide minimal +// fallbacks to keep compilation working on dav-c220. +// --------------------------------------------------------------------------- +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif +#include + +// AICore printf support is gated behind `--cce-enable-print` on some +// toolchains. When enabled, include the CCE print header so `cce::printf` +// resolves in device compilation. +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation +// path, but `bisheng -xcce` still performs a host-side parse pass. +// Provide minimal fallbacks only when the corresponding header wasn't +// pulled in by the selected arch implementation. +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#if defined(__CCE_AICORE__) +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3); +#else +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3); +#endif + +void LaunchAssemble_kernel(float *v1, float *v2, float *v3, void *stream) { +#if defined(__CCE_AICORE__) + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ float*)v3); +#else + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ float*)v3); +#endif +} diff --git a/test/samples/Assemble/board_validation/main.cpp b/test/samples/Assemble/board_validation/main.cpp new file mode 100644 index 00000000..f4fde8f9 --- /dev/null +++ b/test/samples/Assemble/board_validation/main.cpp @@ -0,0 +1,126 @@ +/** +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#include "test_common.h" +#include "acl/acl.h" +#include +#include +#include + +using namespace PtoTestCommon; + +#ifndef TMRGSORT_HPP +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif + +#define ACL_CHECK(expr) \ + do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + const char *_recent = aclGetRecentErrMsg(); \ + if (_recent != nullptr && _recent[0] != '\0') { \ + std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ + } \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +void LaunchAssemble_kernel(float *v1, float *v2, float *v3, void *stream); + +int main() { + size_t elemCount_v1 = 256; + size_t fileSize_v1 = elemCount_v1 * sizeof(float); + size_t elemCount_v2 = 1024; + size_t fileSize_v2 = elemCount_v2 * sizeof(float); + size_t elemCount_v3 = 1024; + size_t fileSize_v3 = elemCount_v3 * sizeof(float); + float *v1Host = nullptr; + float *v1Device = nullptr; + float *v2Host = nullptr; + float *v2Device = nullptr; + float *v3Host = nullptr; + float *v3Device = nullptr; + + int rc = 0; + bool aclInited = false; + bool deviceSet = false; + int deviceId = 0; + aclrtStream stream = nullptr; + + ACL_CHECK(aclInit(nullptr)); + aclInited = true; + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + ACL_CHECK(aclrtSetDevice(deviceId)); + deviceSet = true; + ACL_CHECK(aclrtCreateStream(&stream)); + + ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); + ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); + ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); + ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); + ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); + ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchAssemble_kernel(v1Device, v2Device, v3Device, stream); + + ACL_CHECK(aclrtSynchronizeStream(stream)); + ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("./v3.bin", v3Host, fileSize_v3); + +cleanup: + aclrtFree(v1Device); + aclrtFree(v2Device); + aclrtFree(v3Device); + aclrtFreeHost(v1Host); + aclrtFreeHost(v2Host); + aclrtFreeHost(v3Host); + if (stream != nullptr) { + const aclError _ret = aclrtDestroyStream(stream); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); + } + stream = nullptr; + } + if (deviceSet) { + const aclError _ret = aclrtResetDevice(deviceId); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); + } + } + if (aclInited) { + const aclError _ret = aclFinalize(); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclFinalize()", (int)_ret, __FILE__, __LINE__); + } + } + + return rc; +} diff --git a/test/samples/Assemble/board_validation/outputs.txt b/test/samples/Assemble/board_validation/outputs.txt new file mode 100644 index 00000000..29ef827e --- /dev/null +++ b/test/samples/Assemble/board_validation/outputs.txt @@ -0,0 +1 @@ +v3 diff --git a/test/samples/Assemble/board_validation/run.sh b/test/samples/Assemble/board_validation/run.sh new file mode 100755 index 00000000..945dd54d --- /dev/null +++ b/test/samples/Assemble/board_validation/run.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +SOC_VERSION="${SOC_VERSION:-Ascend910}" +BUILD_DIR="${BUILD_DIR:-build}" +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +cd "${ROOT_DIR}" +python3 "${ROOT_DIR}/golden.py" + +if [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/cann/set_env.sh" ]]; then + set +e; set +u; set +o pipefail + source "/usr/local/Ascend/cann/set_env.sh" || true + set -o pipefail; set -u; set -e +elif [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then + set +e; set +u; set +o pipefail + source "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" || true + set -o pipefail; set -u; set -e +fi + +if [[ -z "${ASCEND_HOME_PATH:-}" ]]; then + echo "[ERROR] ASCEND_HOME_PATH is not set; please source CANN env first." >&2 + exit 2 +fi + +if [[ -z "${PTO_ISA_ROOT:-}" ]]; then + search_dir="${ROOT_DIR}" + for _ in {1..8}; do + if [[ -d "${search_dir}/pto-isa/include" && -d "${search_dir}/pto-isa/tests/common" ]]; then + PTO_ISA_ROOT="${search_dir}/pto-isa" + break + fi + [[ "${search_dir}" == "/" ]] && break + search_dir="$(dirname "${search_dir}")" + done + export PTO_ISA_ROOT="${PTO_ISA_ROOT:-}" +fi + +if [[ -z "${PTO_ISA_ROOT:-}" ]]; then + echo "[ERROR] PTO_ISA_ROOT is not set and auto-detect failed." >&2 + exit 2 +fi + +mkdir -p "${ROOT_DIR}/${BUILD_DIR}" +cmake -S "${ROOT_DIR}" -B "${ROOT_DIR}/${BUILD_DIR}" \ + -DSOC_VERSION="${SOC_VERSION}" \ + -DENABLE_SIM_GOLDEN=OFF \ + -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" +cmake --build "${ROOT_DIR}/${BUILD_DIR}" --parallel + +export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" +"${ROOT_DIR}/${BUILD_DIR}/assemble" +python3 "${ROOT_DIR}/compare.py" diff --git a/test/samples/runop.sh b/test/samples/runop.sh index 335d760a..bbdb4428 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -464,11 +464,26 @@ PY fi if [[ "$base" == "assemble" ]]; then - if ! grep -Fq "TINSERT(" "$cpp"; then - echo -e "${A}(${base}.py)\tFAIL\tmissing TINSERT() lowering for pto.tassemble" + local golden_file="${dir}/assemble.golden" + local assemble_ok=1 + if [[ ! -f "${golden_file}" ]]; then + echo -e "${A}(${base}.py)\tFAIL\tmissing golden ref: ${golden_file}" overall=1 continue fi + while IFS= read -r pat || [[ -n "$pat" ]]; do + [[ -n "$pat" ]] || continue + [[ "$pat" =~ ^# ]] && continue + if ! grep -Eq "$pat" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tgolden mismatch: missing pattern '$pat'" + overall=1 + assemble_ok=0 + break + fi + done < "${golden_file}" + if [[ ${assemble_ok} -eq 0 ]]; then + continue + fi fi if [[ "$base" == "fillpad" ]]; then From 137770737660cb30a33fab4555c462e71879e0c1 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Sat, 14 Mar 2026 22:08:01 +0800 Subject: [PATCH 3/4] fix: stabilize tassemble f32->f16 path and add A3 board validation --- lib/PTO/IR/PTO.cpp | 10 +- test/samples/Assemble/assemble.py | 135 ++++++++++++---- .../Assemble/board_validation/CMakeLists.txt | 2 +- .../Assemble/board_validation/README.md | 23 ++- .../board_validation/assemble_kernel.cpp | 96 ++++++++---- .../Assemble/board_validation/compare.py | 2 +- .../Assemble/board_validation/golden.py | 22 +-- .../Assemble/board_validation/launch.cpp | 10 +- .../Assemble/board_validation/main.cpp | 26 ++- .../Assemble/board_validation/outputs.txt | 2 +- test/samples/Assemble/board_validation/run.sh | 148 ++++++++++++++---- 11 files changed, 352 insertions(+), 124 deletions(-) diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 36a240c4..2919b860 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -1542,8 +1542,14 @@ mlir::LogicalResult mlir::pto::TAssembleOp::verify() { if (srcShape.size() != 2 || dstShape.size() != 2) return emitOpError("expects rank-2 shaped types for src/dst"); - if (getElemTy(srcTy) != getElemTy(dstTy)) - return emitOpError("expects src element type == dst element type"); + Type srcElemTy = getElemTy(srcTy); + Type dstElemTy = getElemTy(dstTy); + bool sameElemTy = srcElemTy == dstElemTy; + bool castElemTy = + srcElemTy.isF32() && (dstElemTy.isF16() || dstElemTy.isBF16()); + if (!sameElemTy && !castElemTy) + return emitOpError( + "expects src/dst element types to match, or src=f32 with dst=f16/bf16"); if (!getIndexRow().getType().isIndex() || !getIndexCol().getType().isIndex()) return emitOpError("expects indexRow/indexCol to be index type"); diff --git a/test/samples/Assemble/assemble.py b/test/samples/Assemble/assemble.py index d6be9fc8..bb3de44e 100644 --- a/test/samples/Assemble/assemble.py +++ b/test/samples/Assemble/assemble.py @@ -1,6 +1,6 @@ from mlir.ir import Context, Location, Module, InsertionPoint from mlir.dialects import func, arith, pto -from mlir.ir import F32Type, IndexType +from mlir.ir import F16Type, F32Type, IndexType def build(): @@ -10,24 +10,87 @@ def build(): with Location.unknown(ctx): m = Module.create() + f16 = F16Type.get(ctx) f32 = F32Type.get(ctx) idx = IndexType.get(ctx) + ptr_f32 = pto.PtrType.get(f32, ctx) + ptr_f16 = pto.PtrType.get(f16, ctx) tv2_f32 = pto.TensorViewType.get(2, f32, ctx) - tile_view_16 = pto.PartitionTensorViewType.get([16, 16], f32, ctx) - tile_view_32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tv2_f16 = pto.TensorViewType.get(2, f16, ctx) - vec = pto.AddressSpaceAttr.get(pto.AddressSpace.VEC, ctx) - bl = pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx) - sl = pto.SLayoutAttr.get(pto.SLayout.NoneBox, ctx) - pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) - cfg = pto.TileBufConfigAttr.get(bl, sl, pto.TileConfig.fractalABSize, pd, ctx) + tile_view_f32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tile_view_f16 = pto.PartitionTensorViewType.get([32, 32], f16, ctx) - src_ty = pto.TileBufType.get([16, 16], f32, vec, [16, 16], cfg, ctx) - dst_ty = pto.TileBufType.get([32, 32], f32, vec, [32, 32], cfg, ctx) + mat = pto.AddressSpaceAttr.get(pto.AddressSpace.MAT, ctx) + left = pto.AddressSpaceAttr.get(pto.AddressSpace.LEFT, ctx) + right = pto.AddressSpaceAttr.get(pto.AddressSpace.RIGHT, ctx) + acc = pto.AddressSpaceAttr.get(pto.AddressSpace.ACC, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) - fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f32], []) + cfg_mat_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_left_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_right_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.ColMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_acc_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + 1024, + pd, + ctx, + ) + cfg_mat_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_left_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_right_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.ColMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + + a_mat_ty = pto.TileBufType.get([32, 32], f32, mat, [32, 32], cfg_mat_f32, ctx) + b_mat_ty = pto.TileBufType.get([32, 32], f32, mat, [32, 32], cfg_mat_f32, ctx) + a_left_ty = pto.TileBufType.get([32, 32], f32, left, [32, 32], cfg_left_f32, ctx) + b_right_ty = pto.TileBufType.get([32, 32], f32, right, [32, 32], cfg_right_f32, ctx) + src_acc_ty = pto.TileBufType.get([32, 32], f32, acc, [32, 32], cfg_acc_f32, ctx) + dst_mat_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg_mat_f16, ctx) + out_left_ty = pto.TileBufType.get([32, 32], f16, left, [32, 32], cfg_left_f16, ctx) + i_mat_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg_mat_f16, ctx) + i_right_ty = pto.TileBufType.get([32, 32], f16, right, [32, 32], cfg_right_f16, ctx) + out_acc_ty = pto.TileBufType.get([32, 32], f32, acc, [32, 32], cfg_acc_f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f16, ptr_f32], []) with InsertionPoint(m.body): fn = func.FuncOp("assemble_kernel", fn_ty) entry = fn.add_entry_block() @@ -35,27 +98,45 @@ def build(): with InsertionPoint(entry): c0 = arith.ConstantOp(idx, 0).result c1 = arith.ConstantOp(idx, 1).result - c8 = arith.ConstantOp(idx, 8).result - c16 = arith.ConstantOp(idx, 16).result c32 = arith.ConstantOp(idx, 32).result - arg_src, arg_dst, arg_out = entry.arguments + arg_a, arg_b, arg_i, arg_out = entry.arguments - tv_src = pto.MakeTensorViewOp(tv2_f32, arg_src, [c16, c16], [c16, c1]).result - tv_dst = pto.MakeTensorViewOp(tv2_f32, arg_dst, [c32, c32], [c32, c1]).result + tv_a = pto.MakeTensorViewOp(tv2_f32, arg_a, [c32, c32], [c32, c1]).result + tv_b = pto.MakeTensorViewOp(tv2_f32, arg_b, [c32, c32], [c32, c1]).result + tv_i = pto.MakeTensorViewOp(tv2_f16, arg_i, [c32, c32], [c32, c1]).result tv_out = pto.MakeTensorViewOp(tv2_f32, arg_out, [c32, c32], [c32, c1]).result - sv_src = pto.PartitionViewOp(tile_view_16, tv_src, offsets=[c0, c0], sizes=[c16, c16]).result - sv_dst = pto.PartitionViewOp(tile_view_32, tv_dst, offsets=[c0, c0], sizes=[c32, c32]).result - sv_out = pto.PartitionViewOp(tile_view_32, tv_out, offsets=[c0, c0], sizes=[c32, c32]).result - - src = pto.AllocTileOp(src_ty).result - dst = pto.AllocTileOp(dst_ty).result - - pto.TLoadOp(None, sv_src, src) - pto.TLoadOp(None, sv_dst, dst) - pto.TAssembleOp(src, c8, c8, dst) - pto.TStoreOp(None, dst, sv_out) + sv_a = pto.PartitionViewOp(tile_view_f32, tv_a, offsets=[c0, c0], sizes=[c32, c32]).result + sv_b = pto.PartitionViewOp(tile_view_f32, tv_b, offsets=[c0, c0], sizes=[c32, c32]).result + sv_i = pto.PartitionViewOp(tile_view_f16, tv_i, offsets=[c0, c0], sizes=[c32, c32]).result + sv_out = pto.PartitionViewOp(tile_view_f32, tv_out, offsets=[c0, c0], sizes=[c32, c32]).result + + a_mat = pto.AllocTileOp(a_mat_ty).result + b_mat = pto.AllocTileOp(b_mat_ty).result + a_left = pto.AllocTileOp(a_left_ty).result + b_right = pto.AllocTileOp(b_right_ty).result + src_acc = pto.AllocTileOp(src_acc_ty).result + dst_mat = pto.AllocTileOp(dst_mat_ty).result + out_left = pto.AllocTileOp(out_left_ty).result + i_mat = pto.AllocTileOp(i_mat_ty).result + i_right = pto.AllocTileOp(i_right_ty).result + out_acc = pto.AllocTileOp(out_acc_ty).result + + pto.TLoadOp(None, sv_a, a_mat) + pto.TLoadOp(None, sv_b, b_mat) + pto.TMovOp(None, a_mat, a_left) + pto.TMovOp(None, b_mat, b_right) + pto.TMatmulOp(None, a_left, b_right, src_acc) + + # Main operation under test: lowering must emit TINSERT(dst, src, row, col). + pto.TAssembleOp(src_acc, c0, c0, dst_mat) + + pto.TLoadOp(None, sv_i, i_mat) + pto.TMovOp(None, dst_mat, out_left) + pto.TMovOp(None, i_mat, i_right) + pto.TMatmulOp(None, out_left, i_right, out_acc) + pto.TStoreOp(None, out_acc, sv_out) func.ReturnOp([]) diff --git a/test/samples/Assemble/board_validation/CMakeLists.txt b/test/samples/Assemble/board_validation/CMakeLists.txt index 88e1dd26..18f73df7 100644 --- a/test/samples/Assemble/board_validation/CMakeLists.txt +++ b/test/samples/Assemble/board_validation/CMakeLists.txt @@ -82,7 +82,7 @@ include_directories( ) add_library(assemble_kernel SHARED assemble_kernel.cpp launch.cpp) - target_compile_options(assemble_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=dav-c220-vec -DMEMORY_BASE -std=c++17) + target_compile_options(assemble_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=dav-c220-cube -DMEMORY_BASE -std=c++17) target_include_directories(assemble_kernel PRIVATE ${ASCEND_HOME_PATH}/pkg_inc/ ${ASCEND_HOME_PATH}/pkg_inc/profiling/ diff --git a/test/samples/Assemble/board_validation/README.md b/test/samples/Assemble/board_validation/README.md index d82824eb..ccb99090 100644 --- a/test/samples/Assemble/board_validation/README.md +++ b/test/samples/Assemble/board_validation/README.md @@ -2,21 +2,30 @@ This test validates `pto.tassemble` functional correctness on board. -Semantics under test: +Semantics under test (A3-legal path): -`dst[i + 8, j + 8] = src[i, j]` for `0 <= i,j < 16`. +1. `acc0 = matmul(v1, v2)` in `ACC` +2. `tassemble(acc0 -> MAT, indexRow=0, indexCol=0)` +3. `acc1 = matmul(MAT_after_assemble, v3_identity)` +4. store `acc1` to `v4` + +Golden reference: + +- default flow uses `assemble_sim` (Ascend simulator) to produce `golden_v4.bin` +- then runs NPU kernel and compares `v4.bin` vs `golden_v4.bin` Inputs/outputs: -- `v1.bin`: source tile (`16x16`, `float32`) -- `v2.bin`: destination tile (`32x32`, `float32`) -- `v3.bin`: output tile (`32x32`, `float32`) -- `golden_v3.bin`: expected output generated by CPU golden +- `v1.bin`: input A (`32x32`, `float32`) +- `v2.bin`: input B (`32x32`, `float32`) +- `v3.bin`: identity matrix (`32x32`, `float16`) +- `v4.bin`: output tile (`32x32`, `float32`) +- `golden_v4.bin`: expected output generated by simulator golden Run on A3: ```bash -cd test/samples/Assemble/npu_validation +cd test/samples/Assemble/board_validation SOC_VERSION=Ascend910 ./run.sh ``` diff --git a/test/samples/Assemble/board_validation/assemble_kernel.cpp b/test/samples/Assemble/board_validation/assemble_kernel.cpp index c50cf58e..9fa72d6a 100644 --- a/test/samples/Assemble/board_validation/assemble_kernel.cpp +++ b/test/samples/Assemble/board_validation/assemble_kernel.cpp @@ -52,42 +52,80 @@ struct MrgSortExecutedNumList { #include "pto/pto-inst.hpp" using namespace pto; -__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3) { - unsigned v4 = 1024; - unsigned v5 = 32; - unsigned v6 = 256; - unsigned v7 = 16; - unsigned v8 = 1; - unsigned v9 = 0; - int32_t v10 = 32; - int32_t v11 = 16; - int32_t v12 = 8; - int32_t v13 = 1; - int64_t v14 = 0; - int64_t v15 = 1024; +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4) { + unsigned v5 = 1024; + unsigned v6 = 32; + unsigned v7 = 1; + unsigned v8 = 0; + int32_t v9 = 32; + int32_t v10 = 1; + int32_t v11 = 0; + int64_t addr0 = 0; + int64_t addr1 = 4096; + int64_t addr2 = 8192; + int64_t addr3 = 10240; + int64_t addr4 = 12288; + int64_t addr5 = 16384; + int64_t addr6 = 20480; using T = float; - pto::Shape<1, 1, 1, 16, 16> v16 = pto::Shape<1, 1, 1, 16, 16>(); - pto::Stride<256, 256, 256, 16, 1> v17 = pto::Stride<256, 256, 256, 16, 1>(); - GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<256, 256, 256, 16, 1>, pto::Layout::ND>(v1 + (v9 + v9 * (unsigned) v11 + v9 * (unsigned) v13), v16, v17); + pto::Shape<1, 1, 1, 32, 32> v16 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v17 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v1 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v16, v17); pto::Shape<1, 1, 1, 32, 32> v19 = pto::Shape<1, 1, 1, 32, 32>(); pto::Stride<1024, 1024, 1024, 32, 1> v20 = pto::Stride<1024, 1024, 1024, 32, 1>(); - GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v2 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v13), v19, v20); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v19, v20); pto::Shape<1, 1, 1, 32, 32> v22 = pto::Shape<1, 1, 1, 32, 32>(); pto::Stride<1024, 1024, 1024, 32, 1> v23 = pto::Stride<1024, 1024, 1024, 32, 1>(); - GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v24 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v3 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v13), v22, v23); - Tile v25; - TASSIGN(v25, v14); - Tile v26; - TASSIGN(v26, v15); - TLOAD(v25, v18); - TLOAD(v26, v21); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v24 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v3 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v22, v23); + pto::Shape<1, 1, 1, 32, 32> v25 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v26 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v27 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v4 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v25, v26); + Tile v28; + TASSIGN(v28, addr0); + Tile v29; + TASSIGN(v29, addr1); + Tile v30; + TASSIGN(v30, addr0); + Tile v31; + TASSIGN(v31, addr0); + Tile v32; + TASSIGN(v32, addr0); + Tile v33; + TASSIGN(v33, addr2); + Tile v34; + TASSIGN(v34, addr4); + Tile v35; + TASSIGN(v35, addr3); + Tile v36; + TASSIGN(v36, addr5); + Tile v37; + TASSIGN(v37, addr6); + TLOAD(v28, v18); set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TLOAD(v29, v21); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TINSERT(v26, v25, v12, v12); - set_flag(PIPE_MTE1, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_MTE3, EVENT_ID0); - TSTORE(v24, v26); + TMOV(v30, v28); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(v31, v29); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + TMATMUL(v32, v30, v31); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TINSERT(v33, v32, v11, v11); + TLOAD(v35, v24); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + pipe_barrier(PIPE_MTE1); + TMOV(v34, v33); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + TMOV(v36, v35); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL(v37, v34, v36); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v27, v37); pipe_barrier(PIPE_ALL); return; } - diff --git a/test/samples/Assemble/board_validation/compare.py b/test/samples/Assemble/board_validation/compare.py index cb360ad7..060dbe3b 100644 --- a/test/samples/Assemble/board_validation/compare.py +++ b/test/samples/Assemble/board_validation/compare.py @@ -155,7 +155,7 @@ def compare_packed_pred_mask(golden_path, output_path, rows, cols): def main(): strict = os.getenv("COMPARE_STRICT", "1") != "0" ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok + ok = compare_bin("golden_v4.bin", "v4.bin", np.float32, 0.0001) and ok if not ok: if strict: print("[ERROR] compare failed") diff --git a/test/samples/Assemble/board_validation/golden.py b/test/samples/Assemble/board_validation/golden.py index 22a1e9fc..01fe4042 100644 --- a/test/samples/Assemble/board_validation/golden.py +++ b/test/samples/Assemble/board_validation/golden.py @@ -7,22 +7,16 @@ def main(): np.random.seed(19) - # Inputs - src = np.random.random(size=(16, 16)).astype(np.float32) - dst = np.random.random(size=(32, 32)).astype(np.float32) - - # Kernel reads an initial output buffer; keep deterministic content. + src_a = np.random.random(size=(32, 32)).astype(np.float32) + src_b = np.random.random(size=(32, 32)).astype(np.float32) + # Identity matrix used by the post-assemble matmul path. + rhs_identity = np.eye(32, dtype=np.float16) out_init = np.zeros((32, 32), dtype=np.float32) - # Golden for TAssemble semantics: - # dst[i + 8, j + 8] = src[i, j] - golden = dst.copy() - golden[8:24, 8:24] = src - - src.tofile("v1.bin") - dst.tofile("v2.bin") - out_init.tofile("v3.bin") - golden.tofile("golden_v3.bin") + src_a.tofile("v1.bin") + src_b.tofile("v2.bin") + rhs_identity.tofile("v3.bin") + out_init.tofile("v4.bin") if __name__ == "__main__": diff --git a/test/samples/Assemble/board_validation/launch.cpp b/test/samples/Assemble/board_validation/launch.cpp index 24196cc0..9da7f7c5 100644 --- a/test/samples/Assemble/board_validation/launch.cpp +++ b/test/samples/Assemble/board_validation/launch.cpp @@ -51,15 +51,15 @@ struct MrgSortExecutedNumList { #endif #if defined(__CCE_AICORE__) -__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3); +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4); #else -__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3); +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4); #endif -void LaunchAssemble_kernel(float *v1, float *v2, float *v3, void *stream) { +void LaunchAssemble_kernel(float *v1, float *v2, aclFloat16 *v3, float *v4, void *stream) { #if defined(__CCE_AICORE__) - assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ float*)v3); + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ half*)v3, (__gm__ float*)v4); #else - assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ float*)v3); + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ half*)v3, (__gm__ float*)v4); #endif } diff --git a/test/samples/Assemble/board_validation/main.cpp b/test/samples/Assemble/board_validation/main.cpp index f4fde8f9..8b74631f 100644 --- a/test/samples/Assemble/board_validation/main.cpp +++ b/test/samples/Assemble/board_validation/main.cpp @@ -41,21 +41,25 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchAssemble_kernel(float *v1, float *v2, float *v3, void *stream); +void LaunchAssemble_kernel(float *v1, float *v2, aclFloat16 *v3, float *v4, void *stream); int main() { - size_t elemCount_v1 = 256; + size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); size_t elemCount_v2 = 1024; size_t fileSize_v2 = elemCount_v2 * sizeof(float); size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); + size_t fileSize_v3 = elemCount_v3 * sizeof(aclFloat16); + size_t elemCount_v4 = 1024; + size_t fileSize_v4 = elemCount_v4 * sizeof(float); float *v1Host = nullptr; float *v1Device = nullptr; float *v2Host = nullptr; float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; + aclFloat16 *v3Host = nullptr; + aclFloat16 *v3Device = nullptr; + float *v4Host = nullptr; + float *v4Device = nullptr; int rc = 0; bool aclInited = false; @@ -75,30 +79,36 @@ int main() { ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); + ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); + ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchAssemble_kernel(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchAssemble_kernel(v1Device, v2Device, v3Device, v4Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + WriteFile("./v4.bin", v4Host, fileSize_v4); cleanup: aclrtFree(v1Device); aclrtFree(v2Device); aclrtFree(v3Device); + aclrtFree(v4Device); aclrtFreeHost(v1Host); aclrtFreeHost(v2Host); aclrtFreeHost(v3Host); + aclrtFreeHost(v4Host); if (stream != nullptr) { const aclError _ret = aclrtDestroyStream(stream); if (_ret != ACL_SUCCESS) { diff --git a/test/samples/Assemble/board_validation/outputs.txt b/test/samples/Assemble/board_validation/outputs.txt index 29ef827e..c694117f 100644 --- a/test/samples/Assemble/board_validation/outputs.txt +++ b/test/samples/Assemble/board_validation/outputs.txt @@ -1 +1 @@ -v3 +v4 diff --git a/test/samples/Assemble/board_validation/run.sh b/test/samples/Assemble/board_validation/run.sh index 945dd54d..ceb3a012 100755 --- a/test/samples/Assemble/board_validation/run.sh +++ b/test/samples/Assemble/board_validation/run.sh @@ -1,28 +1,19 @@ #!/usr/bin/env bash set -euo pipefail -SOC_VERSION="${SOC_VERSION:-Ascend910}" +RUN_MODE="npu" +SOC_VERSION="Ascend910" +GOLDEN_MODE="${GOLDEN_MODE:-sim}" # sim|npu|skip BUILD_DIR="${BUILD_DIR:-build}" +ACL_DEVICE_ID_NPU="${ACL_DEVICE_ID:-}" +ACL_DEVICE_ID_SIM="${ACL_DEVICE_ID_SIM:-0}" + ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${ROOT_DIR}" python3 "${ROOT_DIR}/golden.py" -if [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/cann/set_env.sh" ]]; then - set +e; set +u; set +o pipefail - source "/usr/local/Ascend/cann/set_env.sh" || true - set -o pipefail; set -u; set -e -elif [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then - set +e; set +u; set +o pipefail - source "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" || true - set -o pipefail; set -u; set -e -fi - -if [[ -z "${ASCEND_HOME_PATH:-}" ]]; then - echo "[ERROR] ASCEND_HOME_PATH is not set; please source CANN env first." >&2 - exit 2 -fi - +# Best-effort resolve PTO_ISA_ROOT for generated CMakeLists.txt. if [[ -z "${PTO_ISA_ROOT:-}" ]]; then search_dir="${ROOT_DIR}" for _ in {1..8}; do @@ -30,24 +21,123 @@ if [[ -z "${PTO_ISA_ROOT:-}" ]]; then PTO_ISA_ROOT="${search_dir}/pto-isa" break fi - [[ "${search_dir}" == "/" ]] && break + if [[ "${search_dir}" == "/" ]]; then + break + fi search_dir="$(dirname "${search_dir}")" done export PTO_ISA_ROOT="${PTO_ISA_ROOT:-}" fi -if [[ -z "${PTO_ISA_ROOT:-}" ]]; then - echo "[ERROR] PTO_ISA_ROOT is not set and auto-detect failed." >&2 - exit 2 +# Best-effort load Ascend/CANN environment (toolchains + runtime). Be careful with set -euo pipefail. +if [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/cann/set_env.sh" ]]; then + echo "[INFO] Sourcing /usr/local/Ascend/cann/set_env.sh" + set +e + set +u + set +o pipefail + source "/usr/local/Ascend/cann/set_env.sh" || true + set -o pipefail + set -u + set -e +elif [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then + echo "[INFO] Sourcing /usr/local/Ascend/ascend-toolkit/latest/set_env.sh" + set +e + set +u + set +o pipefail + source "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" || true + set -o pipefail + set -u + set -e +fi + +# Improve runtime linking robustness. +if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then + export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" +fi + +LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH:-}" +LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH_NPU}" +if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then + SIM_SOC_VERSION="${SOC_VERSION}" + if [[ "${SOC_VERSION}" == "Ascend910" ]]; then + if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then + SIM_SOC_VERSION="Ascend910A" + elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then + SIM_SOC_VERSION="Ascend910ProA" + fi + fi + + for d in \ + "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \ + "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \ + "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do + [[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}" + done fi mkdir -p "${ROOT_DIR}/${BUILD_DIR}" -cmake -S "${ROOT_DIR}" -B "${ROOT_DIR}/${BUILD_DIR}" \ - -DSOC_VERSION="${SOC_VERSION}" \ - -DENABLE_SIM_GOLDEN=OFF \ - -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" -cmake --build "${ROOT_DIR}/${BUILD_DIR}" --parallel - -export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" -"${ROOT_DIR}/${BUILD_DIR}/assemble" -python3 "${ROOT_DIR}/compare.py" +cd "${ROOT_DIR}/${BUILD_DIR}" +ENABLE_SIM_GOLDEN="OFF" +[[ "${GOLDEN_MODE}" == "sim" ]] && ENABLE_SIM_GOLDEN="ON" +if [[ -n "${PTO_ISA_ROOT:-}" ]]; then + cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" .. +else + cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" .. +fi +make -j + +cd "${ROOT_DIR}" + +copy_outputs_as_golden() { + if [[ -f "${ROOT_DIR}/outputs.txt" ]]; then + while IFS= read -r name; do + [[ -n "${name}" ]] || continue + cp -f "${ROOT_DIR}/${name}.bin" "${ROOT_DIR}/golden_${name}.bin" + done < "${ROOT_DIR}/outputs.txt" + return 0 + fi + # Fallback: copy every .bin (best-effort). + for f in "${ROOT_DIR}"/*.bin; do + [[ -f "$f" ]] || continue + base="$(basename "$f")" + cp -f "$f" "${ROOT_DIR}/golden_${base}" + done +} + +case "${GOLDEN_MODE}" in + sim) + ACL_DEVICE_ID="${ACL_DEVICE_ID_SIM}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH_SIM}" "${ROOT_DIR}/${BUILD_DIR}/assemble_sim" + copy_outputs_as_golden + if [[ "${RUN_MODE}" == "npu" ]]; then + if [[ -n "${ACL_DEVICE_ID_NPU}" ]]; then + ACL_DEVICE_ID="${ACL_DEVICE_ID_NPU}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + else + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + fi + fi + COMPARE_STRICT=1 python3 "${ROOT_DIR}/compare.py" + ;; + npu) + if [[ "${RUN_MODE}" != "npu" ]]; then + echo "[ERROR] GOLDEN_MODE=npu requires RUN_MODE=npu" >&2 + exit 2 + fi + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + copy_outputs_as_golden + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + COMPARE_STRICT=1 python3 "${ROOT_DIR}/compare.py" + ;; + skip) + if [[ "${RUN_MODE}" == "npu" ]]; then + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + fi + echo "[WARN] compare skipped (GOLDEN_MODE=skip)" + ;; + *) + echo "[ERROR] Unknown GOLDEN_MODE=${GOLDEN_MODE} (expected: sim|npu|skip)" >&2 + exit 2 + ;; +esac From b67d18c95870f2449a0a3cc38547306757c3fb05 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Sun, 15 Mar 2026 09:52:49 +0800 Subject: [PATCH 4/4] chore: rerun CI