Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ python3 ./tmatmulk.py > ./tmatmulk.pto
python3 test/npu_validation/scripts/generate_testcase.py \
--input test/samples/Abs/abs-pto.cpp \
--run-mode npu \
--soc-version Ascend910B1
--pto-arch a3

# 2) 运行验证(run.sh 无需额外参数)
test/samples/Abs/npu_validation/run.sh
Expand Down
11 changes: 5 additions & 6 deletions include/PTO/IR/PTOOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -3133,18 +3133,17 @@ def TScatterOp: PTO_TOp<"tscatter", [

let extraClassDeclaration = [{
::mlir::pto::PIPE getPipe() {
// NOTE: On dav-c220 (Ascend910 A2/A3), pto-isa implements TSCATTER as a
// NOTE: On A2/A3 (--pto-arch=a3), pto-isa implements TSCATTER as a
// scalar loop over UB pointers, which executes on the scalar pipeline
// (PIPE_S). Waiting on PIPE_V does not block scalar UB accesses and can
// lead to using uninitialized indices/data (crash / aivec exception).
//
// On A5 instruction set devices, TSCATTER is implemented with vector
// scatter instructions and should be treated as PIPE_V.
// On A5 instruction set devices (--pto-arch=a5), TSCATTER is implemented
// with vector scatter instructions and should be treated as PIPE_V.
auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>();
if (moduleOp) {
if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) {
auto s = spec.getValue();
if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) {
if (auto arch = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.target_arch")) {
if (arch.getValue().equals_insensitive("a5")) {
return ::mlir::pto::PIPE::PIPE_V;
}
}
Expand Down
69 changes: 41 additions & 28 deletions test/npu_validation/scripts/generate_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,11 +332,11 @@ def _inject_packed_pred_mask_preload(
return kernel_text[:insert_at] + block + kernel_text[insert_at:]


def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str:
def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str:
# Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need
# the "cube" arch; pure vector kernels can use the vector arch.
#
# IMPORTANT: the default arch depends on the Ascend SoC.
# IMPORTANT: the default arch depends on the target architecture.
cube_markers = (
"TileType::Mat",
"TileType::Left",
Expand All @@ -354,19 +354,25 @@ def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str:
)
needs_cube = any(m in kernel_text for m in cube_markers)

sv = (soc_version or "").lower()
if "950" in sv or "a5" in sv:
# Ascend950 (A5) uses A5 instruction set. pto-isa examples build A5
# kernels with dav-c310-{vec|cube}.
arch = (pto_arch or "").strip().lower()
if arch == "a5":
# A5 uses A5 instruction set. pto-isa examples build A5 kernels with
# dav-c310-{vec|cube}.
return "dav-c310-cube" if needs_cube else "dav-c310-vec"
if "910b" in sv:
# Ascend910B* (e.g. Ascend910B1) uses dav-c310 toolchain arch.
return "dav-c310-cube" if needs_cube else "dav-c310-vec"

# Default to Ascend910 (dav-c220) when SoC is unknown.
if arch == "a3":
# A2/A3 uses dav-c220 toolchain arch.
return "dav-c220-cube" if needs_cube else "dav-c220-vec"
# Default to Ascend910 (dav-c220) when arch is unknown.
return "dav-c220-cube" if needs_cube else "dav-c220-vec"


def _soc_version_for_arch(arch: Optional[str]) -> str:
a = (arch or "").strip().lower()
if a == "a5":
return "Ascend910_9599"
return "Ascend910B1"


def _parse_int_list(blob: str):
items = []
for part in blob.split(","):
Expand Down Expand Up @@ -811,7 +817,7 @@ def generate_testcase(
output_root: Optional[Path],
testcase: str,
run_mode: str,
soc_version: str,
pto_arch: Optional[str] = None,
aicore_arch: Optional[str] = None,
):
sample_dir = input_cpp.parent
Expand All @@ -837,15 +843,15 @@ def generate_testcase(
# may be unavailable; build with a vector arch and explicitly enable the
# section macros instead.
if has_dav_cube or has_dav_vec:
sv = (soc_version or "").lower()
if "950" in sv or "a5" in sv:
aicore_arch = "dav-c310-vec"
elif "910b" in sv:
arch = (pto_arch or "").strip().lower()
if arch == "a5":
aicore_arch = "dav-c310-vec"
elif arch == "a3":
aicore_arch = "dav-c220-vec"
else:
aicore_arch = "dav-c220-vec"
else:
aicore_arch = _infer_aicore_arch(raw_kernel, soc_version)
aicore_arch = _infer_aicore_arch(raw_kernel, pto_arch)

# Force-define DAV section macros so both sections are compiled into the
# same binary. This keeps the generated validation executable self-contained
Expand Down Expand Up @@ -1195,10 +1201,10 @@ def generate_testcase(
(output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8")

# pto-isa selects instruction implementations based on MEMORY_BASE vs
# REGISTER_BASE. Ascend A5 (e.g. Ascend950) and Ascend910B use REGISTER_BASE.
# REGISTER_BASE. A5 uses REGISTER_BASE.
mem_base_define = "MEMORY_BASE"
sv = (soc_version or "").lower()
if "910b" in sv or "950" in sv or "a5" in sv:
arch = (pto_arch or "").strip().lower()
if arch == "a5":
mem_base_define = "REGISTER_BASE"

# CCE printing support is gated behind `--cce-enable-print` on some bisheng
Expand All @@ -1225,8 +1231,14 @@ def generate_testcase(
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(NOT DEFINED SOC_VERSION)
set(SOC_VERSION Ascend910)
if(NOT DEFINED PTO_ARCH)
set(PTO_ARCH a3)
endif()
string(TOLOWER "${PTO_ARCH}" PTO_ARCH_LC)
if(PTO_ARCH_LC STREQUAL "a5")
set(SIM_SOC_DIR Ascend910_9599)
else()
set(SIM_SOC_DIR Ascend910B1)
endif()
option(ENABLE_SIM_GOLDEN "Build Ascend simulator (camodel) executable" ON)

Expand Down Expand Up @@ -1333,9 +1345,9 @@ def generate_testcase(
)
target_link_directories({testcase}_sim PUBLIC
${{ASCEND_HOME_PATH}}/lib64
${{ASCEND_HOME_PATH}}/aarch64-linux/simulator/${{SOC_VERSION}}/lib
${{ASCEND_HOME_PATH}}/simulator/${{SOC_VERSION}}/lib
${{ASCEND_HOME_PATH}}/tools/simulator/${{SOC_VERSION}}/lib
${{ASCEND_HOME_PATH}}/aarch64-linux/simulator/${{SIM_SOC_DIR}}/lib
${{ASCEND_HOME_PATH}}/simulator/${{SIM_SOC_DIR}}/lib
${{ASCEND_HOME_PATH}}/tools/simulator/${{SIM_SOC_DIR}}/lib
)
target_link_libraries({testcase}_sim PRIVATE
{testcase}_kernel
Expand Down Expand Up @@ -1390,10 +1402,11 @@ def generate_testcase(
encoding="utf-8",
)

arch_for_runsh = (pto_arch or "a3").strip().lower()
run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8")
run_sh = run_sh.replace("@EXECUTABLE@", testcase)
run_sh = run_sh.replace("@RUN_MODE@", run_mode)
run_sh = run_sh.replace("@SOC_VERSION@", soc_version)
run_sh = run_sh.replace("@PTO_ARCH@", arch_for_runsh)
run_path = output_dir / "run.sh"
run_path.write_text(run_sh, encoding="utf-8")
run_path.chmod(0o755)
Expand All @@ -1405,7 +1418,7 @@ def main():
parser.add_argument("--testcase", default=None, help="Testcase name (default: derived from input filename)")
parser.add_argument("--output-root", default=None, help="Output testcases root directory")
parser.add_argument("--run-mode", default="npu", choices=["sim", "npu"], help="Run mode for run.sh")
parser.add_argument("--soc-version", default="Ascend910", help="SOC version for run.sh")
parser.add_argument("--pto-arch", default="a3", help="Target PTO arch (a3 or a5).")
parser.add_argument(
"--aicore-arch",
default=None,
Expand All @@ -1420,7 +1433,7 @@ def main():
output_root,
testcase,
args.run_mode,
args.soc_version,
pto_arch=args.pto_arch,
aicore_arch=args.aicore_arch,
)

Expand Down
35 changes: 17 additions & 18 deletions test/npu_validation/scripts/run_remote_npu_validation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ set -euo pipefail

STAGE="${STAGE:-run}" # build|run
RUN_MODE="${RUN_MODE:-npu}" # npu|sim
SOC_VERSION="${SOC_VERSION:-Ascend910}"
PTO_ARCH="${PTO_ARCH:-a3}"
GOLDEN_MODE="${GOLDEN_MODE:-npu}" # sim|npu|skip
PTO_ISA_REPO="${PTO_ISA_REPO:-https://github.com/PTO-ISA/pto-isa.git}"
PTO_ISA_COMMIT="${PTO_ISA_COMMIT:-}"
Expand All @@ -24,7 +24,7 @@ fi
log() { echo "[$(date +'%F %T')] $*"; }

log "=== Remote NPU Validation ==="
log "STAGE=${STAGE} RUN_MODE=${RUN_MODE} SOC_VERSION=${SOC_VERSION}"
log "STAGE=${STAGE} RUN_MODE=${RUN_MODE} PTO_ARCH=${PTO_ARCH}"
log "GOLDEN_MODE=${GOLDEN_MODE}"
log "DEVICE_ID=${DEVICE_ID}"
log "PTO_ISA_REPO=${PTO_ISA_REPO}"
Expand Down Expand Up @@ -121,24 +121,23 @@ fi

export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}"

# Some CANN installs do not provide a simulator directory named exactly
# "Ascend910". Map it to a real directory so we can link/run camodel.
SIM_SOC_VERSION="${SOC_VERSION}"
if [[ "${SOC_VERSION}" == "Ascend910" ]]; then
if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then
SIM_SOC_VERSION="Ascend910A"
elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then
SIM_SOC_VERSION="Ascend910ProA"
fi
fi
log "SIM_SOC_VERSION=${SIM_SOC_VERSION}"
# Resolve the simulator SoC directory from PTO_ARCH (compared
# case-insensitively). Only a5 selects the Ascend910_9599 directory; a3 and any
# unrecognised value use Ascend910B1, and the lowercase arch is normalised to a3.
pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')"
if [[ "${pto_arch_lc}" == "a5" ]]; then
  SIM_SOC_DIR="Ascend910_9599"
else
  SIM_SOC_DIR="Ascend910B1"
  pto_arch_lc="a3"
fi
log "SIM_SOC_DIR=${SIM_SOC_DIR}"

LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH}"
for d in \
"${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \
"${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \
"${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do
"${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_DIR}/lib" \
"${ASCEND_HOME_PATH}/simulator/${SIM_SOC_DIR}/lib" \
"${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_DIR}/lib"; do
[[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}"
done

Expand Down Expand Up @@ -216,7 +215,7 @@ while IFS= read -r -d '' cpp; do
--testcase "${testcase}" \
--output-root "${OUTPUT_ROOT}" \
--run-mode "${RUN_MODE}" \
--soc-version "${SIM_SOC_VERSION}"
--pto-arch "${PTO_ARCH}"
gen_rc=$?
set -euo pipefail
if [[ $gen_rc -ne 0 ]]; then
Expand All @@ -236,7 +235,7 @@ while IFS= read -r -d '' cpp; do
enable_sim_golden="OFF"
[[ "${GOLDEN_MODE}" == "sim" ]] && enable_sim_golden="ON"
cmake -S . -B ./build \
-DSOC_VERSION="${SIM_SOC_VERSION}" \
-DPTO_ARCH="${PTO_ARCH}" \
-DENABLE_SIM_GOLDEN="${enable_sim_golden}" \
-DPTO_ISA_ROOT="${PTO_ISA_ROOT}"
cmake --build ./build --parallel
Expand Down
29 changes: 14 additions & 15 deletions test/npu_validation/templates/run_sh_template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@
set -euo pipefail

RUN_MODE="@RUN_MODE@"
SOC_VERSION="@SOC_VERSION@"
# Target PTO arch: an exported PTO_ARCH takes precedence; otherwise use the
# value the generator baked into this script.
PTO_ARCH="${PTO_ARCH:-@PTO_ARCH@}"
# Fall back to a3 when the value is empty or is still a raw template
# placeholder. The placeholder is recognised by its leading '@' rather than
# being spelled out in full: the generator substitutes every literal occurrence
# of the token, so a spelled-out comparison would turn into `== "a5"` in a
# script generated for a5 and silently force the arch back to a3.
if [[ -z "${PTO_ARCH}" || "${PTO_ARCH}" == "@"* ]]; then
  PTO_ARCH="a3"
fi

# Simulator SoC directory for the arch (case-insensitive match); anything other
# than a5 uses the Ascend910B1 directory.
case "${PTO_ARCH,,}" in
  a5) SIM_SOC_DIR="Ascend910_9599" ;;
  *) SIM_SOC_DIR="Ascend910B1" ;;
esac
GOLDEN_MODE="${GOLDEN_MODE:-npu}" # sim|npu|skip
BUILD_DIR="${BUILD_DIR:-build}"

Expand Down Expand Up @@ -56,19 +64,10 @@ fi
LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH:-}"
LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH_NPU}"
if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then
SIM_SOC_VERSION="${SOC_VERSION}"
if [[ "${SOC_VERSION}" == "Ascend910" ]]; then
if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then
SIM_SOC_VERSION="Ascend910A"
elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then
SIM_SOC_VERSION="Ascend910ProA"
fi
fi

for d in \
"${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \
"${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \
"${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do
"${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_DIR}/lib" \
"${ASCEND_HOME_PATH}/simulator/${SIM_SOC_DIR}/lib" \
"${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_DIR}/lib"; do
[[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}"
done
fi
Expand All @@ -78,9 +77,9 @@ cd "${ROOT_DIR}/${BUILD_DIR}"
ENABLE_SIM_GOLDEN="OFF"
[[ "${GOLDEN_MODE}" == "sim" ]] && ENABLE_SIM_GOLDEN="ON"
if [[ -n "${PTO_ISA_ROOT:-}" ]]; then
cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" ..
cmake -DPTO_ARCH="${PTO_ARCH}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" ..
else
cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" ..
cmake -DPTO_ARCH="${PTO_ARCH}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" ..
fi
make -j

Expand Down
2 changes: 1 addition & 1 deletion test/samples/Bf16/bf16_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def build():
pto.register_dialect(ctx, load=True)

module = builtin.ModuleOp()
module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1")
module.attributes["pto.target_arch"] = StringAttr.get("a3")

bf16 = BF16Type.get()
ptr_bf16 = pto.PtrType.get(bf16)
Expand Down
2 changes: 1 addition & 1 deletion test/samples/MatMul/0.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @RunTMATMULSplitK(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: i1) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/MatMul/tmatmulk.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @RunTMATMULSplitK(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: i1) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/MatMul/tmatmulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def build(
pto.register_dialect(ctx, load=True)

module = builtin.ModuleOp()
module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1")
module.attributes["pto.target_arch"] = StringAttr.get("a3")

# ---- element types ----
t_out = F32Type.get()
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Matmul_transpose/Matmul_transpose-pto-ir.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @RunTEXTRACT(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: i1, %arg4: i1) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Matmul_transpose/Matmul_transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def build(
pto.register_dialect(ctx, load=True)

module = builtin.ModuleOp()
module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1")
module.attributes["pto.target_arch"] = StringAttr.get("a3")

t_out = F32Type.get()
t_a = F32Type.get()
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/compensation_test.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @compensation_check(%arg0: !pto.ptr<f32>, %cond: i1) {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/matmul.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @RunTMATMULSplitK(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: !pto.ptr<f32>, %arg3: !pto.ptr<f32>, %arg4: i1) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/nested_loop_confliect.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @nested_loop_sync(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/rar_optimization_test.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @rar_hazard_check(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/test_if_else_tile_result.pto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module attributes {"pto.device-spec" = "Ascend910B1"} {
module attributes {"pto.target_arch" = "a3"} {
func.func @test_if_else_tile_result(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %arg2: i32, %arg3: !pto.ptr<f32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down
2 changes: 1 addition & 1 deletion test/samples/Sync/tmatmulk_autosync.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def build(
pto.register_dialect(ctx, load=True)

module = builtin.ModuleOp()
module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1")
module.attributes["pto.target_arch"] = StringAttr.get("a3")

# ---- element types ----
t_out = F32Type.get()
Expand Down
Loading
Loading