diff --git a/docs/PTO_IR_manual.md b/docs/PTO_IR_manual.md index 03a6667d..8c081be7 100644 --- a/docs/PTO_IR_manual.md +++ b/docs/PTO_IR_manual.md @@ -4517,6 +4517,44 @@ pto.treshape ins(%src : !pto.tile_buf<...>) outs(%dst : !pto.tile_buf<...>) --- +##### `pto.tassemble` - Insert Sub-Tile Window + +**Summary:** Inserts a source tile into a destination tile at a given row/col offset. + +**Semantics:** + +``` +dst[i + indexRow, j + indexCol] = src[i, j] +``` + +**Arguments:** + +| Name | Type | Description | +|------|------|-------------| +| `src` | `pto.tile_buf` | Source tile | +| `indexRow` | `Index` | Destination row offset | +| `indexCol` | `Index` | Destination column offset | +| `dst` | `pto.tile_buf` | Destination tile | + +**Results:** None. Writes into `dst` via DPS pattern. + +**Constraints & Verification:** + +- The operation has a custom verifier + +**Hardware Mapping:** + +- Lowers to **`TINSERT(dst, src, indexRow, indexCol)`** +- Uses target data-movement pipeline (MTE1 by default; A5 UB->L1 path uses MTE3) + +**Basic Example:** + +```mlir +pto.tassemble ins(%src, %row, %col : !pto.tile_buf<...>, index, index) outs(%dst : !pto.tile_buf<...>) +``` + +--- + ##### `pto.textract` - Extract Sub-Tile Window **Summary:** Extracts a sub-tile window from a source tile into a destination tile. diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 267297e2..0b181207 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -2027,6 +2027,50 @@ def TExtractOp : PTO_TOp<"textract", [ }]; } +def TAssembleOp : PTO_TOp<"tassemble", [ + PTO_DpsInitOpInterface, + OpPipeInterface, + DeclareOpInterfaceMethods +]> { + let summary = "Insert src sub-tile into dst at (indexRow, indexCol) (tilebuf, DPS)"; + + let arguments = (ins + PTODpsType:$src, + Index:$indexRow, + Index:$indexCol, + PTODpsType:$dst + ); + + let results = (outs); + + let hasVerifier = 1; + + let assemblyFormat = [{ + `ins` `(` $src `,` $indexRow `,` $indexCol `:` qualified(type($src)) `,` type($indexRow) `,` type($indexCol) `)` + `outs` `(` $dst `:` qualified(type($dst) ) `)` + attr-dict + }]; + + let extraClassDeclaration = [{ + // TINSERT runs on different DMA pipes across targets. + // - A5 (Ascend950/910_95): UB->L1 path is MTE3 in pto-isa custom kernels. + // - Others: keep MTE1 for compatibility with existing data-movement sync. + ::mlir::pto::PIPE getPipe() { + auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>(); + if (moduleOp) { + if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) { + auto s = spec.getValue(); + if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) { + return ::mlir::pto::PIPE::PIPE_MTE3; + } + } + } + return ::mlir::pto::PIPE::PIPE_MTE1; + } + ::mlir::MutableOperandRange getDpsInitsMutable() { return getDstMutable(); } + }]; +} + def TFillPadOp : PTO_TOp<"tfillpad", [ PTO_DpsInitOpInterface, OpPipeInterface, diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 02670518..2919b860 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -1528,6 +1528,75 @@ mlir::LogicalResult mlir::pto::TExtractOp::verify() { return mlir::success(); } //===----------------------------------------------------------------------===// +// TAssembleOp_DPS verifier +//===----------------------------------------------------------------------===// + +mlir::LogicalResult mlir::pto::TAssembleOp::verify() { + Type srcTy = getSrc().getType(); + Type dstTy = getDst().getType(); + if (!isPTOShapedLike(srcTy) || !isPTOShapedLike(dstTy)) + return emitOpError("expects src/dst to be PTO shaped-like types"); + + auto srcShape = getShapeVec(srcTy); + auto dstShape = getShapeVec(dstTy); + if (srcShape.size() != 2 || dstShape.size() != 2) + return emitOpError("expects rank-2 shaped types for src/dst"); + + Type srcElemTy = getElemTy(srcTy); + Type dstElemTy = getElemTy(dstTy); + bool sameElemTy = srcElemTy == dstElemTy; + bool castElemTy = + srcElemTy.isF32() && (dstElemTy.isF16() || dstElemTy.isBF16()); + if (!sameElemTy && !castElemTy) + return emitOpError( + "expects src/dst element types to match, or src=f32 with dst=f16/bf16"); + + if (!getIndexRow().getType().isIndex() || !getIndexCol().getType().isIndex()) + return emitOpError("expects indexRow/indexCol to be index type"); + + auto readConstIndex = [&](Value v, int64_t &out) -> bool { + if (auto cOp = v.getDefiningOp()) { + out = cOp.value(); + return true; + } + if (auto cInt = v.getDefiningOp()) { + out = cInt.value(); + return true; + } + if (auto cOp = v.getDefiningOp()) { + if (auto ia = mlir::dyn_cast(cOp.getValue())) { + out = ia.getInt(); + return true; + } + } + return false; + }; + + int64_t r0 = 0; + int64_t c0 = 0; + bool rowConst = readConstIndex(getIndexRow(), r0); + bool colConst = readConstIndex(getIndexCol(), c0); + if (rowConst && r0 < 0) + return emitOpError("indexRow must be non-negative"); + if (colConst && c0 < 0) + return emitOpError("indexCol must be non-negative"); + + int64_t srcRows = srcShape[0]; + int64_t srcCols = srcShape[1]; + int64_t dstRows = dstShape[0]; + int64_t dstCols = dstShape[1]; + if (rowConst && srcRows != mlir::ShapedType::kDynamic && + dstRows != mlir::ShapedType::kDynamic && + r0 + srcRows > dstRows) + return emitOpError("indexRow + src rows exceeds dst rows"); + if (colConst && srcCols != mlir::ShapedType::kDynamic && + dstCols != mlir::ShapedType::kDynamic && + c0 + srcCols > dstCols) + return emitOpError("indexCol + src cols exceeds dst cols"); + + return mlir::success(); +} +//===----------------------------------------------------------------------===// // TFillPadOp_DPS verifier //===----------------------------------------------------------------------===// @@ -4282,6 +4351,13 @@ void TExtractOp::getEffects( PTO_ADD_WRITE(getDstMutable()); } +// TASSEMBLE: Read(src) -> Write(dst) +void TAssembleOp::getEffects( + SmallVectorImpl> &effects) { + PTO_ADD_READ(getSrcMutable()); + PTO_ADD_WRITE(getDstMutable()); +} + PTO_DEFINE_UNARY_EFFECTS(TFillPadOp, getSrcMutable(), getDstMutable()) PTO_DEFINE_UNARY_EFFECTS(TFillPadExpandOp, getSrcMutable(), getDstMutable()) diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 939287dd..5a95250a 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -4743,6 +4743,31 @@ struct PTOExtractToEmitC : public OpConversionPattern { } }; //===----------------------------------------------------------------------===// +// pto.tassemble lowering -> TINSERT(dst, src, indexRow, indexCol) +//===----------------------------------------------------------------------===// + +struct PTOAssembleToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::TAssembleOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + Value src = peelUnrealized(adaptor.getSrc()); + Value dst = peelUnrealized(adaptor.getDst()); + Value r0 = peelUnrealized(adaptor.getIndexRow()); + Value c0 = peelUnrealized(adaptor.getIndexCol()); + + rewriter.create( + loc, TypeRange{}, "TINSERT", + /*args=*/ArrayAttr{}, /*templateArgs=*/ArrayAttr{}, + /*operands=*/ValueRange{dst, src, r0, c0}); + + rewriter.eraseOp(op); + return success(); + } +}; +//===----------------------------------------------------------------------===// // pto.tfillpad lowering -> TFILLPAD(dst, src) //===----------------------------------------------------------------------===// @@ -7298,7 +7323,7 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); - patterns.add(typeConverter, ctx); + patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); patterns.add(typeConverter, ctx); diff --git a/test/samples/Assemble/assemble.golden b/test/samples/Assemble/assemble.golden new file mode 100644 index 00000000..46a9385c --- /dev/null +++ b/test/samples/Assemble/assemble.golden @@ -0,0 +1,4 @@ +# Regex patterns that must appear in emitted C++ for assemble.py +TLOAD\( +TINSERT\( +TSTORE\( diff --git a/test/samples/Assemble/assemble.py b/test/samples/Assemble/assemble.py new file mode 100644 index 00000000..bb3de44e --- /dev/null +++ b/test/samples/Assemble/assemble.py @@ -0,0 +1,148 @@ +from mlir.ir import Context, Location, Module, InsertionPoint +from mlir.dialects import func, arith, pto +from mlir.ir import F16Type, F32Type, IndexType + + +def build(): + with Context() as ctx: + pto.register_dialect(ctx, load=True) + + with Location.unknown(ctx): + m = Module.create() + + f16 = F16Type.get(ctx) + f32 = F32Type.get(ctx) + idx = IndexType.get(ctx) + + ptr_f32 = pto.PtrType.get(f32, ctx) + ptr_f16 = pto.PtrType.get(f16, ctx) + + tv2_f32 = pto.TensorViewType.get(2, f32, ctx) + tv2_f16 = pto.TensorViewType.get(2, f16, ctx) + + tile_view_f32 = pto.PartitionTensorViewType.get([32, 32], f32, ctx) + tile_view_f16 = pto.PartitionTensorViewType.get([32, 32], f16, ctx) + + mat = pto.AddressSpaceAttr.get(pto.AddressSpace.MAT, ctx) + left = pto.AddressSpaceAttr.get(pto.AddressSpace.LEFT, ctx) + right = pto.AddressSpaceAttr.get(pto.AddressSpace.RIGHT, ctx) + acc = pto.AddressSpaceAttr.get(pto.AddressSpace.ACC, ctx) + pd = pto.PadValueAttr.get(pto.PadValue.Null, ctx) + + cfg_mat_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_left_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_right_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.ColMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_acc_f32 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + 1024, + pd, + ctx, + ) + cfg_mat_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_left_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.ColMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.RowMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + cfg_right_f16 = pto.TileBufConfigAttr.get( + pto.BLayoutAttr.get(pto.BLayout.RowMajor, ctx), + pto.SLayoutAttr.get(pto.SLayout.ColMajor, ctx), + pto.TileConfig.fractalABSize, + pd, + ctx, + ) + + a_mat_ty = pto.TileBufType.get([32, 32], f32, mat, [32, 32], cfg_mat_f32, ctx) + b_mat_ty = pto.TileBufType.get([32, 32], f32, mat, [32, 32], cfg_mat_f32, ctx) + a_left_ty = pto.TileBufType.get([32, 32], f32, left, [32, 32], cfg_left_f32, ctx) + b_right_ty = pto.TileBufType.get([32, 32], f32, right, [32, 32], cfg_right_f32, ctx) + src_acc_ty = pto.TileBufType.get([32, 32], f32, acc, [32, 32], cfg_acc_f32, ctx) + dst_mat_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg_mat_f16, ctx) + out_left_ty = pto.TileBufType.get([32, 32], f16, left, [32, 32], cfg_left_f16, ctx) + i_mat_ty = pto.TileBufType.get([32, 32], f16, mat, [32, 32], cfg_mat_f16, ctx) + i_right_ty = pto.TileBufType.get([32, 32], f16, right, [32, 32], cfg_right_f16, ctx) + out_acc_ty = pto.TileBufType.get([32, 32], f32, acc, [32, 32], cfg_acc_f32, ctx) + + fn_ty = func.FunctionType.get([ptr_f32, ptr_f32, ptr_f16, ptr_f32], []) + with InsertionPoint(m.body): + fn = func.FuncOp("assemble_kernel", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + c0 = arith.ConstantOp(idx, 0).result + c1 = arith.ConstantOp(idx, 1).result + c32 = arith.ConstantOp(idx, 32).result + + arg_a, arg_b, arg_i, arg_out = entry.arguments + + tv_a = pto.MakeTensorViewOp(tv2_f32, arg_a, [c32, c32], [c32, c1]).result + tv_b = pto.MakeTensorViewOp(tv2_f32, arg_b, [c32, c32], [c32, c1]).result + tv_i = pto.MakeTensorViewOp(tv2_f16, arg_i, [c32, c32], [c32, c1]).result + tv_out = pto.MakeTensorViewOp(tv2_f32, arg_out, [c32, c32], [c32, c1]).result + + sv_a = pto.PartitionViewOp(tile_view_f32, tv_a, offsets=[c0, c0], sizes=[c32, c32]).result + sv_b = pto.PartitionViewOp(tile_view_f32, tv_b, offsets=[c0, c0], sizes=[c32, c32]).result + sv_i = pto.PartitionViewOp(tile_view_f16, tv_i, offsets=[c0, c0], sizes=[c32, c32]).result + sv_out = pto.PartitionViewOp(tile_view_f32, tv_out, offsets=[c0, c0], sizes=[c32, c32]).result + + a_mat = pto.AllocTileOp(a_mat_ty).result + b_mat = pto.AllocTileOp(b_mat_ty).result + a_left = pto.AllocTileOp(a_left_ty).result + b_right = pto.AllocTileOp(b_right_ty).result + src_acc = pto.AllocTileOp(src_acc_ty).result + dst_mat = pto.AllocTileOp(dst_mat_ty).result + out_left = pto.AllocTileOp(out_left_ty).result + i_mat = pto.AllocTileOp(i_mat_ty).result + i_right = pto.AllocTileOp(i_right_ty).result + out_acc = pto.AllocTileOp(out_acc_ty).result + + pto.TLoadOp(None, sv_a, a_mat) + pto.TLoadOp(None, sv_b, b_mat) + pto.TMovOp(None, a_mat, a_left) + pto.TMovOp(None, b_mat, b_right) + pto.TMatmulOp(None, a_left, b_right, src_acc) + + # Main operation under test: lowering must emit TINSERT(dst, src, row, col). + pto.TAssembleOp(src_acc, c0, c0, dst_mat) + + pto.TLoadOp(None, sv_i, i_mat) + pto.TMovOp(None, dst_mat, out_left) + pto.TMovOp(None, i_mat, i_right) + pto.TMatmulOp(None, out_left, i_right, out_acc) + pto.TStoreOp(None, out_acc, sv_out) + + func.ReturnOp([]) + + m.operation.verify() + return m + + +if __name__ == "__main__": + print(build()) diff --git a/test/samples/Assemble/board_validation/CMakeLists.txt b/test/samples/Assemble/board_validation/CMakeLists.txt new file mode 100644 index 00000000..18f73df7 --- /dev/null +++ b/test/samples/Assemble/board_validation/CMakeLists.txt @@ -0,0 +1,129 @@ +cmake_minimum_required(VERSION 3.16) + +# Prefer setting compilers before project() so CMake picks up bisheng correctly. +set(CMAKE_C_COMPILER bisheng) +set(CMAKE_CXX_COMPILER bisheng) + +project(assemble_npu_validation) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +if(NOT DEFINED SOC_VERSION) + set(SOC_VERSION Ascend910) +endif() +option(ENABLE_SIM_GOLDEN "Build Ascend simulator (camodel) executable" ON) + +if(NOT DEFINED ENV{ASCEND_HOME_PATH}) + message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please source the CANN set_env.sh.") +else() + set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH}) +endif() + +set(PTO_ISA_ROOT "" CACHE PATH "Path to pto-isa repo") +if(NOT PTO_ISA_ROOT) + set(_PTO_ISA_CANDIDATES + "${CMAKE_CURRENT_LIST_DIR}/../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../pto-isa" + "${CMAKE_CURRENT_LIST_DIR}/../../../../../../pto-isa" + ) + foreach(_cand IN LISTS _PTO_ISA_CANDIDATES) + if(EXISTS "${_cand}/include" AND EXISTS "${_cand}/tests/common") + set(PTO_ISA_ROOT "${_cand}" CACHE PATH "Path to pto-isa repo" FORCE) + break() + endif() + endforeach() +endif() +if(NOT PTO_ISA_ROOT) + message(FATAL_ERROR "Cannot find PTO_ISA_ROOT, please pass -DPTO_ISA_ROOT=/path/to/pto-isa.") +endif() + +set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver) + +add_compile_options( + -D_FORTIFY_SOURCE=2 + -O2 -std=c++17 + -Wno-macro-redefined -Wno-ignored-attributes + -fstack-protector-strong + -fPIC +) +add_link_options( + -s + -Wl,-z,relro + -Wl,-z,now +) + + set(CMAKE_CCE_COMPILE_OPTIONS + -xcce + -fenable-matrix + --cce-aicore-enable-tl + + + -fPIC + -Xhost-start -Xhost-end + "SHELL:-mllvm -cce-aicore-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-record-overflow=true" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" +) + +set(CMAKE_CPP_COMPILE_OPTIONS + -xc++ + "SHELL:-include stdint.h" + "SHELL:-include stddef.h" +) + +include_directories( + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common + ${ASCEND_HOME_PATH}/include + ${ASCEND_DRIVER_PATH}/kernel/inc +) + + add_library(assemble_kernel SHARED assemble_kernel.cpp launch.cpp) + target_compile_options(assemble_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=dav-c220-cube -DMEMORY_BASE -std=c++17) + target_include_directories(assemble_kernel PRIVATE + ${ASCEND_HOME_PATH}/pkg_inc/ + ${ASCEND_HOME_PATH}/pkg_inc/profiling/ + ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime + ) +target_link_options(assemble_kernel PRIVATE --cce-fatobj-link) + +add_executable(assemble main.cpp) +target_compile_options(assemble PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) +target_include_directories(assemble PRIVATE + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common +) + +target_link_directories(assemble PUBLIC + ${ASCEND_HOME_PATH}/lib64 +) + +target_link_libraries(assemble PRIVATE + assemble_kernel + runtime + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase +) + +if(ENABLE_SIM_GOLDEN) + # Simulator executable: used to generate golden outputs (Ascend camodel). + add_executable(assemble_sim main.cpp) + target_compile_options(assemble_sim PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) + target_include_directories(assemble_sim PRIVATE + ${PTO_ISA_ROOT}/include + ${PTO_ISA_ROOT}/tests/common + ) + target_link_directories(assemble_sim PUBLIC + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SOC_VERSION}/lib + ${ASCEND_HOME_PATH}/simulator/${SOC_VERSION}/lib + ${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib + ) + target_link_libraries(assemble_sim PRIVATE + assemble_kernel + runtime_camodel + stdc++ ascendcl m tiling_api platform c_sec dl nnopbase + ) +endif() diff --git a/test/samples/Assemble/board_validation/README.md b/test/samples/Assemble/board_validation/README.md new file mode 100644 index 00000000..ccb99090 --- /dev/null +++ b/test/samples/Assemble/board_validation/README.md @@ -0,0 +1,34 @@ +# Assemble NPU Validation (A3) + +This test validates `pto.tassemble` functional correctness on board. + +Semantics under test (A3-legal path): + +1. `acc0 = matmul(v1, v2)` in `ACC` +2. `tassemble(acc0 -> MAT, indexRow=0, indexCol=0)` +3. `acc1 = matmul(MAT_after_assemble, v3_identity)` +4. store `acc1` to `v4` + +Golden reference: + +- default flow uses `assemble_sim` (Ascend simulator) to produce `golden_v4.bin` +- then runs NPU kernel and compares `v4.bin` vs `golden_v4.bin` + +Inputs/outputs: + +- `v1.bin`: input A (`32x32`, `float32`) +- `v2.bin`: input B (`32x32`, `float32`) +- `v3.bin`: identity matrix (`32x32`, `float16`) +- `v4.bin`: output tile (`32x32`, `float32`) +- `golden_v4.bin`: expected output generated by simulator golden + +Run on A3: + +```bash +cd test/samples/Assemble/board_validation +SOC_VERSION=Ascend910 ./run.sh +``` + +A passing run prints: + +`[INFO] compare passed` diff --git a/test/samples/Assemble/board_validation/assemble_kernel.cpp b/test/samples/Assemble/board_validation/assemble_kernel.cpp new file mode 100644 index 00000000..9fa72d6a --- /dev/null +++ b/test/samples/Assemble/board_validation/assemble_kernel.cpp @@ -0,0 +1,131 @@ +// --------------------------------------------------------------------------- +// PTOAS compatibility layer +// +// The upstream pto-isa headers reference some FP8/FP4 types and the +// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain +// combination (e.g. __NPU_ARCH__==2201). +// +// For our PTOAS-generated kernels we don't rely on these types today, but the +// headers still mention them in templates/static_asserts. Provide minimal +// fallbacks to keep compilation working on dav-c220. +// --------------------------------------------------------------------------- +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif +#include + +// AICore printf support is gated behind `--cce-enable-print` on some +// toolchains. When enabled, include the CCE print header so `cce::printf` +// resolves in device compilation. +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation +// path, but `bisheng -xcce` still performs a host-side parse pass. +// Provide minimal fallbacks only when the corresponding header wasn't +// pulled in by the selected arch implementation. +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4) { + unsigned v5 = 1024; + unsigned v6 = 32; + unsigned v7 = 1; + unsigned v8 = 0; + int32_t v9 = 32; + int32_t v10 = 1; + int32_t v11 = 0; + int64_t addr0 = 0; + int64_t addr1 = 4096; + int64_t addr2 = 8192; + int64_t addr3 = 10240; + int64_t addr4 = 12288; + int64_t addr5 = 16384; + int64_t addr6 = 20480; + using T = float; + pto::Shape<1, 1, 1, 32, 32> v16 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v17 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v1 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v16, v17); + pto::Shape<1, 1, 1, 32, 32> v19 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v20 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v19, v20); + pto::Shape<1, 1, 1, 32, 32> v22 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v23 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v24 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v3 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v22, v23); + pto::Shape<1, 1, 1, 32, 32> v25 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<1024, 1024, 1024, 32, 1> v26 = pto::Stride<1024, 1024, 1024, 32, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND> v27 = GlobalTensor, pto::Stride<1024, 1024, 1024, 32, 1>, pto::Layout::ND>(v4 + (v8 + v8 * (unsigned) v9 + v8 * (unsigned) v10), v25, v26); + Tile v28; + TASSIGN(v28, addr0); + Tile v29; + TASSIGN(v29, addr1); + Tile v30; + TASSIGN(v30, addr0); + Tile v31; + TASSIGN(v31, addr0); + Tile v32; + TASSIGN(v32, addr0); + Tile v33; + TASSIGN(v33, addr2); + Tile v34; + TASSIGN(v34, addr4); + Tile v35; + TASSIGN(v35, addr3); + Tile v36; + TASSIGN(v36, addr5); + Tile v37; + TASSIGN(v37, addr6); + TLOAD(v28, v18); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TLOAD(v29, v21); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(v30, v28); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(v31, v29); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + TMATMUL(v32, v30, v31); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TINSERT(v33, v32, v11, v11); + TLOAD(v35, v24); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + pipe_barrier(PIPE_MTE1); + TMOV(v34, v33); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + TMOV(v36, v35); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL(v37, v34, v36); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v27, v37); + pipe_barrier(PIPE_ALL); + return; +} diff --git a/test/samples/Assemble/board_validation/compare.py b/test/samples/Assemble/board_validation/compare.py new file mode 100644 index 00000000..060dbe3b --- /dev/null +++ b/test/samples/Assemble/board_validation/compare.py @@ -0,0 +1,169 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import os +import sys +import numpy as np + + +def compare_bin(golden_path, output_path, dtype, eps): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np) + output = np.fromfile(output_path, dtype=dtype_np) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + g = golden.astype(np.int64, copy=False) + o = output.astype(np.int64, copy=False) + else: + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" + ) + else: + print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_bin_prefix(golden_path, output_path, dtype, eps, count): + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + try: + count = int(count) + except Exception: + print(f"[ERROR] Invalid prefix count: {count}") + return False + if count <= 0: + print(f"[ERROR] Invalid prefix count: {count}") + return False + + dtype_np = np.dtype(dtype) + golden = np.fromfile(golden_path, dtype=dtype_np, count=count) + output = np.fromfile(output_path, dtype=dtype_np, count=count) + + if golden.size != count or output.size != count: + print( + f"[ERROR] Prefix read too small: need={count} elems, " + f"golden={golden.size}, out={output.size}" + ) + return False + + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + if golden.size: + if np.issubdtype(dtype_np, np.floating): + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): + g = golden.astype(np.int64, copy=False) + o = output.astype(np.int64, copy=False) + else: + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + diff = float(abs_diff[idx]) + print( + f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " + f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" + ) + else: + print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") + return False + return True + + +def compare_packed_pred_mask(golden_path, output_path, rows, cols): + """ + Compare outputs of pto.tcmp / pto.tcmps. + + These ops produce a *packed predicate mask* and do not define every byte in + the logical u8 tile buffer. In practice, only the first N bytes of each row + are meaningful (packed as 64-bit chunks). Ignore the rest to avoid flaky + compares caused by undefined bytes. + """ + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + try: + rows = int(rows) + cols = int(cols) + except Exception: + print(f"[ERROR] Invalid rows/cols for packed mask compare: rows={rows} cols={cols}") + return False + if rows <= 0 or cols <= 0: + print(f"[ERROR] Invalid rows/cols for packed mask compare: rows={rows} cols={cols}") + return False + + golden = np.fromfile(golden_path, dtype=np.uint8) + output = np.fromfile(output_path, dtype=np.uint8) + + need = rows * cols + if golden.size < need or output.size < need: + print( + f"[ERROR] Packed mask buffer too small: need={need} bytes, " + f"golden={golden.size}, out={output.size}" + ) + return False + + golden = golden[:need].reshape(rows, cols) + output = output[:need].reshape(rows, cols) + + # Packed mask layout: 1 predicate bit per element, packed into 64-bit words + # per row (so 8 bytes per 64 columns). For cols <= 64 we still use one word. + row_bytes = ((cols + 63) // 64) * 8 + row_bytes = min(row_bytes, cols) + + golden_sel = golden[:, :row_bytes].reshape(-1) + output_sel = output[:, :row_bytes].reshape(-1) + + if not np.array_equal(golden_sel, output_sel): + diff = np.nonzero(golden_sel != output_sel)[0] + idx = int(diff[0]) if diff.size else 0 + print( + f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " + f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, rows={rows}, cols={cols}, row_bytes={row_bytes})" + ) + return False + return True + + +def main(): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + ok = True + ok = compare_bin("golden_v4.bin", "v4.bin", np.float32, 0.0001) and ok + if not ok: + if strict: + print("[ERROR] compare failed") + sys.exit(2) + print("[WARN] compare failed (non-gating)") + return + print("[INFO] compare passed") + + +if __name__ == "__main__": + main() diff --git a/test/samples/Assemble/board_validation/golden.py b/test/samples/Assemble/board_validation/golden.py new file mode 100644 index 00000000..01fe4042 --- /dev/null +++ b/test/samples/Assemble/board_validation/golden.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 +# coding=utf-8 + +import numpy as np + + +def main(): + np.random.seed(19) + + src_a = np.random.random(size=(32, 32)).astype(np.float32) + src_b = np.random.random(size=(32, 32)).astype(np.float32) + # Identity matrix used by the post-assemble matmul path. + rhs_identity = np.eye(32, dtype=np.float16) + out_init = np.zeros((32, 32), dtype=np.float32) + + src_a.tofile("v1.bin") + src_b.tofile("v2.bin") + rhs_identity.tofile("v3.bin") + out_init.tofile("v4.bin") + + +if __name__ == "__main__": + main() diff --git a/test/samples/Assemble/board_validation/launch.cpp b/test/samples/Assemble/board_validation/launch.cpp new file mode 100644 index 00000000..9da7f7c5 --- /dev/null +++ b/test/samples/Assemble/board_validation/launch.cpp @@ -0,0 +1,65 @@ +// --------------------------------------------------------------------------- +// PTOAS compatibility layer +// +// The upstream pto-isa headers reference some FP8/FP4 types and the +// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain +// combination (e.g. __NPU_ARCH__==2201). +// +// For our PTOAS-generated kernels we don't rely on these types today, but the +// headers still mention them in templates/static_asserts. Provide minimal +// fallbacks to keep compilation working on dav-c220. +// --------------------------------------------------------------------------- +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif +#include + +// AICore printf support is gated behind `--cce-enable-print` on some +// toolchains. When enabled, include the CCE print header so `cce::printf` +// resolves in device compilation. +#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) +#include +#endif +#include +#include + +// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation +// path, but `bisheng -xcce` still performs a host-side parse pass. +// Provide minimal fallbacks only when the corresponding header wasn't +// pulled in by the selected arch implementation. +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +#if defined(__CCE_AICORE__) +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4); +#else +__global__ AICORE void assemble_kernel(__gm__ float* v1, __gm__ float* v2, __gm__ half* v3, __gm__ float* v4); +#endif + +void LaunchAssemble_kernel(float *v1, float *v2, aclFloat16 *v3, float *v4, void *stream) { +#if defined(__CCE_AICORE__) + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ half*)v3, (__gm__ float*)v4); +#else + assemble_kernel<<<1, nullptr, stream>>>((__gm__ float*)v1, (__gm__ float*)v2, (__gm__ half*)v3, (__gm__ float*)v4); +#endif +} diff --git a/test/samples/Assemble/board_validation/main.cpp b/test/samples/Assemble/board_validation/main.cpp new file mode 100644 index 00000000..8b74631f --- /dev/null +++ b/test/samples/Assemble/board_validation/main.cpp @@ -0,0 +1,136 @@ +/** +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#include "test_common.h" +#include "acl/acl.h" +#include +#include +#include + +using namespace PtoTestCommon; + +#ifndef TMRGSORT_HPP +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif + +#define ACL_CHECK(expr) \ + do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + const char *_recent = aclGetRecentErrMsg(); \ + if (_recent != nullptr && _recent[0] != '\0') { \ + std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ + } \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +void LaunchAssemble_kernel(float *v1, float *v2, aclFloat16 *v3, float *v4, void *stream); + +int main() { + size_t elemCount_v1 = 1024; + size_t fileSize_v1 = elemCount_v1 * sizeof(float); + size_t elemCount_v2 = 1024; + size_t fileSize_v2 = elemCount_v2 * sizeof(float); + size_t elemCount_v3 = 1024; + size_t fileSize_v3 = elemCount_v3 * sizeof(aclFloat16); + size_t elemCount_v4 = 1024; + size_t fileSize_v4 = elemCount_v4 * sizeof(float); + float *v1Host = nullptr; + float *v1Device = nullptr; + float *v2Host = nullptr; + float *v2Device = nullptr; + aclFloat16 *v3Host = nullptr; + aclFloat16 *v3Device = nullptr; + float *v4Host = nullptr; + float *v4Device = nullptr; + + int rc = 0; + bool aclInited = false; + bool deviceSet = false; + int deviceId = 0; + aclrtStream stream = nullptr; + + ACL_CHECK(aclInit(nullptr)); + aclInited = true; + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + ACL_CHECK(aclrtSetDevice(deviceId)); + deviceSet = true; + ACL_CHECK(aclrtCreateStream(&stream)); + + ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); + ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); + ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); + ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); + ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); + ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); + ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); + ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); + ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchAssemble_kernel(v1Device, v2Device, v3Device, v4Device, stream); + + ACL_CHECK(aclrtSynchronizeStream(stream)); + ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("./v4.bin", v4Host, fileSize_v4); + +cleanup: + aclrtFree(v1Device); + aclrtFree(v2Device); + aclrtFree(v3Device); + aclrtFree(v4Device); + aclrtFreeHost(v1Host); + aclrtFreeHost(v2Host); + aclrtFreeHost(v3Host); + aclrtFreeHost(v4Host); + if (stream != nullptr) { + const aclError _ret = aclrtDestroyStream(stream); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); + } + stream = nullptr; + } + if (deviceSet) { + const aclError _ret = aclrtResetDevice(deviceId); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); + } + } + if (aclInited) { + const aclError _ret = aclFinalize(); + if (_ret != ACL_SUCCESS) { + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", + "aclFinalize()", (int)_ret, __FILE__, __LINE__); + } + } + + return rc; +} diff --git a/test/samples/Assemble/board_validation/outputs.txt b/test/samples/Assemble/board_validation/outputs.txt new file mode 100644 index 00000000..c694117f --- /dev/null +++ b/test/samples/Assemble/board_validation/outputs.txt @@ -0,0 +1 @@ +v4 diff --git a/test/samples/Assemble/board_validation/run.sh b/test/samples/Assemble/board_validation/run.sh new file mode 100755 index 00000000..ceb3a012 --- /dev/null +++ b/test/samples/Assemble/board_validation/run.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +RUN_MODE="npu" +SOC_VERSION="Ascend910" +GOLDEN_MODE="${GOLDEN_MODE:-sim}" # sim|npu|skip +BUILD_DIR="${BUILD_DIR:-build}" +ACL_DEVICE_ID_NPU="${ACL_DEVICE_ID:-}" +ACL_DEVICE_ID_SIM="${ACL_DEVICE_ID_SIM:-0}" + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +cd "${ROOT_DIR}" +python3 "${ROOT_DIR}/golden.py" + +# Best-effort resolve PTO_ISA_ROOT for generated CMakeLists.txt. +if [[ -z "${PTO_ISA_ROOT:-}" ]]; then + search_dir="${ROOT_DIR}" + for _ in {1..8}; do + if [[ -d "${search_dir}/pto-isa/include" && -d "${search_dir}/pto-isa/tests/common" ]]; then + PTO_ISA_ROOT="${search_dir}/pto-isa" + break + fi + if [[ "${search_dir}" == "/" ]]; then + break + fi + search_dir="$(dirname "${search_dir}")" + done + export PTO_ISA_ROOT="${PTO_ISA_ROOT:-}" +fi + +# Best-effort load Ascend/CANN environment (toolchains + runtime). Be careful with set -euo pipefail. +if [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/cann/set_env.sh" ]]; then + echo "[INFO] Sourcing /usr/local/Ascend/cann/set_env.sh" + set +e + set +u + set +o pipefail + source "/usr/local/Ascend/cann/set_env.sh" || true + set -o pipefail + set -u + set -e +elif [[ -z "${ASCEND_HOME_PATH:-}" && -f "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" ]]; then + echo "[INFO] Sourcing /usr/local/Ascend/ascend-toolkit/latest/set_env.sh" + set +e + set +u + set +o pipefail + source "/usr/local/Ascend/ascend-toolkit/latest/set_env.sh" || true + set -o pipefail + set -u + set -e +fi + +# Improve runtime linking robustness. +if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then + export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" +fi + +LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH:-}" +LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH_NPU}" +if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then + SIM_SOC_VERSION="${SOC_VERSION}" + if [[ "${SOC_VERSION}" == "Ascend910" ]]; then + if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then + SIM_SOC_VERSION="Ascend910A" + elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then + SIM_SOC_VERSION="Ascend910ProA" + fi + fi + + for d in \ + "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \ + "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \ + "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do + [[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}" + done +fi + +mkdir -p "${ROOT_DIR}/${BUILD_DIR}" +cd "${ROOT_DIR}/${BUILD_DIR}" +ENABLE_SIM_GOLDEN="OFF" +[[ "${GOLDEN_MODE}" == "sim" ]] && ENABLE_SIM_GOLDEN="ON" +if [[ -n "${PTO_ISA_ROOT:-}" ]]; then + cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" .. +else + cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" .. +fi +make -j + +cd "${ROOT_DIR}" + +copy_outputs_as_golden() { + if [[ -f "${ROOT_DIR}/outputs.txt" ]]; then + while IFS= read -r name; do + [[ -n "${name}" ]] || continue + cp -f "${ROOT_DIR}/${name}.bin" "${ROOT_DIR}/golden_${name}.bin" + done < "${ROOT_DIR}/outputs.txt" + return 0 + fi + # Fallback: copy every .bin (best-effort). + for f in "${ROOT_DIR}"/*.bin; do + [[ -f "$f" ]] || continue + base="$(basename "$f")" + cp -f "$f" "${ROOT_DIR}/golden_${base}" + done +} + +case "${GOLDEN_MODE}" in + sim) + ACL_DEVICE_ID="${ACL_DEVICE_ID_SIM}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH_SIM}" "${ROOT_DIR}/${BUILD_DIR}/assemble_sim" + copy_outputs_as_golden + if [[ "${RUN_MODE}" == "npu" ]]; then + if [[ -n "${ACL_DEVICE_ID_NPU}" ]]; then + ACL_DEVICE_ID="${ACL_DEVICE_ID_NPU}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + else + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + fi + fi + COMPARE_STRICT=1 python3 "${ROOT_DIR}/compare.py" + ;; + npu) + if [[ "${RUN_MODE}" != "npu" ]]; then + echo "[ERROR] GOLDEN_MODE=npu requires RUN_MODE=npu" >&2 + exit 2 + fi + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + copy_outputs_as_golden + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + COMPARE_STRICT=1 python3 "${ROOT_DIR}/compare.py" + ;; + skip) + if [[ "${RUN_MODE}" == "npu" ]]; then + python3 "${ROOT_DIR}/golden.py" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH_NPU}" "${ROOT_DIR}/${BUILD_DIR}/assemble" + fi + echo "[WARN] compare skipped (GOLDEN_MODE=skip)" + ;; + *) + echo "[ERROR] Unknown GOLDEN_MODE=${GOLDEN_MODE} (expected: sim|npu|skip)" >&2 + exit 2 + ;; +esac diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ee50e931..bbdb4428 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -463,6 +463,29 @@ PY fi fi + if [[ "$base" == "assemble" ]]; then + local golden_file="${dir}/assemble.golden" + local assemble_ok=1 + if [[ ! -f "${golden_file}" ]]; then + echo -e "${A}(${base}.py)\tFAIL\tmissing golden ref: ${golden_file}" + overall=1 + continue + fi + while IFS= read -r pat || [[ -n "$pat" ]]; do + [[ -n "$pat" ]] || continue + [[ "$pat" =~ ^# ]] && continue + if ! grep -Eq "$pat" "$cpp"; then + echo -e "${A}(${base}.py)\tFAIL\tgolden mismatch: missing pattern '$pat'" + overall=1 + assemble_ok=0 + break + fi + done < "${golden_file}" + if [[ ${assemble_ok} -eq 0 ]]; then + continue + fi + fi + if [[ "$base" == "fillpad" ]]; then if ! grep -Fq "TFILLPAD(" "$cpp"; then echo -e "${A}(${base}.py)\tFAIL\tmissing TFILLPAD() lowering for pto.tfillpad"