From 9b2188776e87613884d699002abcd82125767140 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 19:22:19 +0800 Subject: [PATCH 01/14] Use pto-arch a3 for Ascend910B handling --- README.md | 2 +- include/PTO/IR/PTOOps.td | 11 ++-- .../scripts/generate_testcase.py | 60 ++++++++++++------- .../scripts/run_remote_npu_validation.sh | 29 +++++---- test/samples/Bf16/bf16_tile.py | 2 +- test/samples/MatMul/0.pto | 2 +- test/samples/MatMul/tmatmulk.pto | 2 +- test/samples/MatMul/tmatmulk.py | 2 +- test/samples/Sync/compensation_test.pto | 2 +- test/samples/Sync/matmul.pto | 2 +- test/samples/Sync/nested_loop_confliect.pto | 2 +- test/samples/Sync/rar_optimization_test.pto | 2 +- test/samples/Sync/tmatmulk_autosync.py | 2 +- test/samples/Sync/tmatmulk_autosync_a5.py | 2 +- test/samples/runop.sh | 10 ++-- .../testdata/matmul_static_singlecore.pto | 2 +- 16 files changed, 76 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index b33dced9..fbe76dc3 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ python3 ./tmatmulk.py > ./tmatmulk.pto python3 test/npu_validation/scripts/generate_testcase.py \ --input test/samples/Abs/abs-pto.cpp \ --run-mode npu \ - --soc-version Ascend910B1 + --pto-arch a3 # 2) 运行验证(run.sh 无需额外参数) test/samples/Abs/npu_validation/run.sh diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 267297e2..6d243352 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -3133,18 +3133,17 @@ def TScatterOp: PTO_TOp<"tscatter", [ let extraClassDeclaration = [{ ::mlir::pto::PIPE getPipe() { - // NOTE: On dav-c220 (Ascend910 A2/A3), pto-isa implements TSCATTER as a + // NOTE: On A2/A3 (--pto-arch=a3), pto-isa implements TSCATTER as a // scalar loop over UB pointers, which executes on the scalar pipeline // (PIPE_S). Waiting on PIPE_V does not block scalar UB accesses and can // lead to using uninitialized indices/data (crash / aivec exception). 
// - // On A5 instruction set devices, TSCATTER is implemented with vector - // scatter instructions and should be treated as PIPE_V. + // On A5 instruction set devices (--pto-arch=a5), TSCATTER is implemented + // with vector scatter instructions and should be treated as PIPE_V. auto moduleOp = getOperation()->getParentOfType<::mlir::ModuleOp>(); if (moduleOp) { - if (auto spec = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.device-spec")) { - auto s = spec.getValue(); - if (s.starts_with("Ascend950") || s.starts_with("Ascend910_95")) { + if (auto arch = moduleOp->getAttrOfType<::mlir::StringAttr>("pto.target_arch")) { + if (arch.getValue().equals_insensitive("a5")) { return ::mlir::pto::PIPE::PIPE_V; } } diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 217a4eaf..284efff6 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -332,7 +332,25 @@ def _inject_packed_pred_mask_preload( return kernel_text[:insert_at] + block + kernel_text[insert_at:] -def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str: +def _normalize_pto_arch(pto_arch: Optional[str]) -> Optional[str]: + if not pto_arch: + return None + arch = pto_arch.strip().lower() + if arch.startswith("a3"): + return "a3" + if arch.startswith("a5"): + return "a5" + return None + + +def _default_soc_version(pto_arch: Optional[str]) -> str: + arch = _normalize_pto_arch(pto_arch) + if arch == "a5": + return "Ascend910_9599" + return "Ascend910B1" + + +def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: # Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need # the "cube" arch; pure vector kernels can use the vector arch. 
# @@ -354,17 +372,16 @@ def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str: ) needs_cube = any(m in kernel_text for m in cube_markers) - sv = (soc_version or "").lower() - if "950" in sv or "a5" in sv: - # Ascend950 (A5) uses A5 instruction set. pto-isa examples build A5 - # kernels with dav-c310-{vec|cube}. + arch = _normalize_pto_arch(pto_arch) or "a3" + if arch == "a5": + # A5 uses A5 instruction set. pto-isa examples build A5 kernels with + # dav-c310-{vec|cube}. return "dav-c310-cube" if needs_cube else "dav-c310-vec" - if "910b" in sv: - # Ascend910B* (e.g. Ascend910B1) uses dav-c310 toolchain arch. + if arch == "a3": + # A2/A3 uses dav-c310 toolchain arch. return "dav-c310-cube" if needs_cube else "dav-c310-vec" - - # Default to Ascend910 (dav-c220) when SoC is unknown. - return "dav-c220-cube" if needs_cube else "dav-c220-vec" + # Default to A2/A3 (dav-c310) when arch is unknown. + return "dav-c310-cube" if needs_cube else "dav-c310-vec" def _parse_int_list(blob: str): @@ -811,7 +828,7 @@ def generate_testcase( output_root: Optional[Path], testcase: str, run_mode: str, - soc_version: str, + pto_arch: Optional[str] = None, aicore_arch: Optional[str] = None, ): sample_dir = input_cpp.parent @@ -837,15 +854,13 @@ def generate_testcase( # may be unavailable; build with a vector arch and explicitly enable the # section macros instead. if has_dav_cube or has_dav_vec: - sv = (soc_version or "").lower() - if "950" in sv or "a5" in sv: - aicore_arch = "dav-c310-vec" - elif "910b" in sv: + arch = _normalize_pto_arch(pto_arch) or "a3" + if arch == "a5" or arch == "a3": aicore_arch = "dav-c310-vec" else: - aicore_arch = "dav-c220-vec" + aicore_arch = "dav-c310-vec" else: - aicore_arch = _infer_aicore_arch(raw_kernel, soc_version) + aicore_arch = _infer_aicore_arch(raw_kernel, pto_arch) # Force-define DAV section macros so both sections are compiled into the # same binary. 
This keeps the generated validation executable self-contained @@ -1195,10 +1210,10 @@ def generate_testcase( (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE vs - # REGISTER_BASE. Ascend A5 (e.g. Ascend950) and Ascend910B use REGISTER_BASE. + # REGISTER_BASE. A5 (e.g. Ascend950) and A2/A3 use REGISTER_BASE. mem_base_define = "MEMORY_BASE" - sv = (soc_version or "").lower() - if "910b" in sv or "950" in sv or "a5" in sv: + arch = _normalize_pto_arch(pto_arch) + if arch == "a3" or arch == "a5": mem_base_define = "REGISTER_BASE" # CCE printing support is gated behind `--cce-enable-print` on some bisheng @@ -1390,6 +1405,7 @@ def generate_testcase( encoding="utf-8", ) + soc_version = _default_soc_version(pto_arch) run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8") run_sh = run_sh.replace("@EXECUTABLE@", testcase) run_sh = run_sh.replace("@RUN_MODE@", run_mode) @@ -1405,7 +1421,7 @@ def main(): parser.add_argument("--testcase", default=None, help="Testcase name (default: derived from input filename)") parser.add_argument("--output-root", default=None, help="Output testcases root directory") parser.add_argument("--run-mode", default="npu", choices=["sim", "npu"], help="Run mode for run.sh") - parser.add_argument("--soc-version", default="Ascend910", help="SOC version for run.sh") + parser.add_argument("--pto-arch", default=None, help="Target PTO arch (a3 or a5).") parser.add_argument( "--aicore-arch", default=None, @@ -1420,7 +1436,7 @@ def main(): output_root, testcase, args.run_mode, - args.soc_version, + pto_arch=args.pto_arch, aicore_arch=args.aicore_arch, ) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 43f766dd..30ca54b3 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -3,7 +3,7 @@ 
set -euo pipefail STAGE="${STAGE:-run}" # build|run RUN_MODE="${RUN_MODE:-npu}" # npu|sim -SOC_VERSION="${SOC_VERSION:-Ascend910}" +PTO_ARCH="${PTO_ARCH:-a3}" GOLDEN_MODE="${GOLDEN_MODE:-npu}" # sim|npu|skip PTO_ISA_REPO="${PTO_ISA_REPO:-https://github.com/PTO-ISA/pto-isa.git}" PTO_ISA_COMMIT="${PTO_ISA_COMMIT:-}" @@ -24,7 +24,7 @@ fi log() { echo "[$(date +'%F %T')] $*"; } log "=== Remote NPU Validation ===" -log "STAGE=${STAGE} RUN_MODE=${RUN_MODE} SOC_VERSION=${SOC_VERSION}" +log "STAGE=${STAGE} RUN_MODE=${RUN_MODE} PTO_ARCH=${PTO_ARCH}" log "GOLDEN_MODE=${GOLDEN_MODE}" log "DEVICE_ID=${DEVICE_ID}" log "PTO_ISA_REPO=${PTO_ISA_REPO}" @@ -121,16 +121,17 @@ fi export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" -# Some CANN installs do not provide a simulator directory named exactly -# "Ascend910". Map it to a real directory so we can link/run camodel. +pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')" +case "${pto_arch_lc}" in + a5) SOC_VERSION="Ascend910_9599" ;; + a3) SOC_VERSION="Ascend910B1" ;; + *) + SOC_VERSION="Ascend910B1" + pto_arch_lc="a3" + ;; +esac + SIM_SOC_VERSION="${SOC_VERSION}" -if [[ "${SOC_VERSION}" == "Ascend910" ]]; then - if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then - SIM_SOC_VERSION="Ascend910A" - elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then - SIM_SOC_VERSION="Ascend910ProA" - fi -fi log "SIM_SOC_VERSION=${SIM_SOC_VERSION}" LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH}" @@ -211,12 +212,16 @@ while IFS= read -r -d '' cpp; do nv_dir="${OUTPUT_ROOT}/${sample_name}/${testcase}" set +e + pto_arch_args=() + if [[ -n "${PTO_ARCH}" ]]; then + pto_arch_args+=(--pto-arch "${PTO_ARCH}") + fi python3 "${ROOT_DIR}/test/npu_validation/scripts/generate_testcase.py" \ --input "${cpp}" \ --testcase "${testcase}" \ --output-root "${OUTPUT_ROOT}" \ --run-mode "${RUN_MODE}" \ - --soc-version "${SIM_SOC_VERSION}" + "${pto_arch_args[@]}" gen_rc=$? 
set -euo pipefail if [[ $gen_rc -ne 0 ]]; then diff --git a/test/samples/Bf16/bf16_tile.py b/test/samples/Bf16/bf16_tile.py index a3962d2a..1be1ff43 100644 --- a/test/samples/Bf16/bf16_tile.py +++ b/test/samples/Bf16/bf16_tile.py @@ -18,7 +18,7 @@ def build(): pto.register_dialect(ctx, load=True) module = builtin.ModuleOp() - module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1") + module.attributes["pto.target_arch"] = StringAttr.get("a3") bf16 = BF16Type.get() ptr_bf16 = pto.PtrType.get(bf16) diff --git a/test/samples/MatMul/0.pto b/test/samples/MatMul/0.pto index 83d7c2ae..76db1894 100644 --- a/test/samples/MatMul/0.pto +++ b/test/samples/MatMul/0.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @RunTMATMULSplitK(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: i1) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/MatMul/tmatmulk.pto b/test/samples/MatMul/tmatmulk.pto index 0ccaf8a6..905b911c 100644 --- a/test/samples/MatMul/tmatmulk.pto +++ b/test/samples/MatMul/tmatmulk.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @RunTMATMULSplitK(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: i1) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/MatMul/tmatmulk.py b/test/samples/MatMul/tmatmulk.py index 0984c574..21a1d6a1 100644 --- a/test/samples/MatMul/tmatmulk.py +++ b/test/samples/MatMul/tmatmulk.py @@ -29,7 +29,7 @@ def build( pto.register_dialect(ctx, load=True) module = builtin.ModuleOp() - module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1") + module.attributes["pto.target_arch"] = StringAttr.get("a3") # ---- element types ---- t_out = F32Type.get() diff --git a/test/samples/Sync/compensation_test.pto 
b/test/samples/Sync/compensation_test.pto index 36f02bd3..0373ece3 100644 --- a/test/samples/Sync/compensation_test.pto +++ b/test/samples/Sync/compensation_test.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @compensation_check(%arg0: !pto.ptr, %cond: i1) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index diff --git a/test/samples/Sync/matmul.pto b/test/samples/Sync/matmul.pto index 35e040f9..3663ddc1 100644 --- a/test/samples/Sync/matmul.pto +++ b/test/samples/Sync/matmul.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @RunTMATMULSplitK(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: i1) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/Sync/nested_loop_confliect.pto b/test/samples/Sync/nested_loop_confliect.pto index c35f9a67..3fafb1e1 100644 --- a/test/samples/Sync/nested_loop_confliect.pto +++ b/test/samples/Sync/nested_loop_confliect.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @nested_loop_sync(%arg0: !pto.ptr, %arg1: !pto.ptr) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/Sync/rar_optimization_test.pto b/test/samples/Sync/rar_optimization_test.pto index 30585376..de4c7fb6 100644 --- a/test/samples/Sync/rar_optimization_test.pto +++ b/test/samples/Sync/rar_optimization_test.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @rar_hazard_check(%arg0: !pto.ptr, %arg1: !pto.ptr) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index diff --git a/test/samples/Sync/tmatmulk_autosync.py b/test/samples/Sync/tmatmulk_autosync.py index 1937fb62..22237c79 100644 --- 
a/test/samples/Sync/tmatmulk_autosync.py +++ b/test/samples/Sync/tmatmulk_autosync.py @@ -32,7 +32,7 @@ def build( pto.register_dialect(ctx, load=True) module = builtin.ModuleOp() - module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1") + module.attributes["pto.target_arch"] = StringAttr.get("a3") # ---- element types ---- t_out = F32Type.get() diff --git a/test/samples/Sync/tmatmulk_autosync_a5.py b/test/samples/Sync/tmatmulk_autosync_a5.py index 3ef27e76..15c2b9ad 100644 --- a/test/samples/Sync/tmatmulk_autosync_a5.py +++ b/test/samples/Sync/tmatmulk_autosync_a5.py @@ -32,7 +32,7 @@ def build( pto.register_dialect(ctx, load=True) module = builtin.ModuleOp() - module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1") + module.attributes["pto.target_arch"] = StringAttr.get("a5") # ---- element types ---- t_out = F32Type.get() diff --git a/test/samples/runop.sh b/test/samples/runop.sh index ee50e931..8e26dd7e 100755 --- a/test/samples/runop.sh +++ b/test/samples/runop.sh @@ -187,13 +187,11 @@ process_one_dir() { esac # A5-only sample: buffer-id synchronization ops lower to CCEC get_buf/rls_buf - # intrinsics, which are not supported on older SoCs (e.g. Ascend910(A3)). - # Skip this python sample unless SOC_VERSION indicates an A5 target. + # intrinsics, which are not supported on A2/A3 (--pto-arch=a3). + # Skip this python sample unless --pto-arch indicates an A5 target. 
if [[ "$base" == "test_a5_buf_sync" ]]; then - soc="${SOC_VERSION:-}" - soc_lc="$(printf '%s' "${soc}" | tr '[:upper:]' '[:lower:]')" - if [[ "$soc_lc" != *"a5"* && "$soc_lc" != *"950"* ]]; then - echo -e "${A}(${base}.py)\tSKIP\trequires A5 (set SOC_VERSION to A5/950)" + if [[ "$(printf '%s' "$target_arch" | tr '[:upper:]' '[:lower:]')" != "a5" ]]; then + echo -e "${A}(${base}.py)\tSKIP\trequires A5 (set --pto-arch=a5)" continue fi fi diff --git a/tools/ptobc/testdata/matmul_static_singlecore.pto b/tools/ptobc/testdata/matmul_static_singlecore.pto index f4838f53..c7209285 100644 --- a/tools/ptobc/testdata/matmul_static_singlecore.pto +++ b/tools/ptobc/testdata/matmul_static_singlecore.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @RunTMATMULSplitK(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: i1) { pto.section.cube { %c0 = arith.constant 0 : index From 2ca9d260c633b877fbfbbf778a0967941b49ca6c Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 20:36:26 +0800 Subject: [PATCH 02/14] Use pto-arch value in generated run.sh --- .../scripts/generate_testcase.py | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 284efff6..f81f26df 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -343,18 +343,11 @@ def _normalize_pto_arch(pto_arch: Optional[str]) -> Optional[str]: return None -def _default_soc_version(pto_arch: Optional[str]) -> str: - arch = _normalize_pto_arch(pto_arch) - if arch == "a5": - return "Ascend910_9599" - return "Ascend910B1" - - def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: # Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need # the "cube" arch; pure vector kernels can use the 
vector arch. # - # IMPORTANT: the default arch depends on the Ascend SoC. + # IMPORTANT: the default arch depends on the target architecture. cube_markers = ( "TileType::Mat", "TileType::Left", @@ -372,7 +365,7 @@ def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: ) needs_cube = any(m in kernel_text for m in cube_markers) - arch = _normalize_pto_arch(pto_arch) or "a3" + arch = _normalize_pto_arch(pto_arch) if arch == "a5": # A5 uses A5 instruction set. pto-isa examples build A5 kernels with # dav-c310-{vec|cube}. @@ -380,8 +373,8 @@ def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: if arch == "a3": # A2/A3 uses dav-c310 toolchain arch. return "dav-c310-cube" if needs_cube else "dav-c310-vec" - # Default to A2/A3 (dav-c310) when arch is unknown. - return "dav-c310-cube" if needs_cube else "dav-c310-vec" + # Default to Ascend910 (dav-c220) when arch is unknown. + return "dav-c220-cube" if needs_cube else "dav-c220-vec" def _parse_int_list(blob: str): @@ -854,11 +847,11 @@ def generate_testcase( # may be unavailable; build with a vector arch and explicitly enable the # section macros instead. 
if has_dav_cube or has_dav_vec: - arch = _normalize_pto_arch(pto_arch) or "a3" + arch = _normalize_pto_arch(pto_arch) if arch == "a5" or arch == "a3": aicore_arch = "dav-c310-vec" else: - aicore_arch = "dav-c310-vec" + aicore_arch = "dav-c220-vec" else: aicore_arch = _infer_aicore_arch(raw_kernel, pto_arch) @@ -1405,11 +1398,11 @@ def generate_testcase( encoding="utf-8", ) - soc_version = _default_soc_version(pto_arch) + arch_for_runsh = _normalize_pto_arch(pto_arch) or (pto_arch or "Ascend910") run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8") run_sh = run_sh.replace("@EXECUTABLE@", testcase) run_sh = run_sh.replace("@RUN_MODE@", run_mode) - run_sh = run_sh.replace("@SOC_VERSION@", soc_version) + run_sh = run_sh.replace("@SOC_VERSION@", arch_for_runsh) run_path = output_dir / "run.sh" run_path.write_text(run_sh, encoding="utf-8") run_path.chmod(0o755) From d4a060f1e453ca57784371b73f6d5a8e33268c69 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 20:46:28 +0800 Subject: [PATCH 03/14] Adjust a3/a5 arch mapping in testcase generator --- .../scripts/generate_testcase.py | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index f81f26df..9bcadb60 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -332,17 +332,6 @@ def _inject_packed_pred_mask_preload( return kernel_text[:insert_at] + block + kernel_text[insert_at:] -def _normalize_pto_arch(pto_arch: Optional[str]) -> Optional[str]: - if not pto_arch: - return None - arch = pto_arch.strip().lower() - if arch.startswith("a3"): - return "a3" - if arch.startswith("a5"): - return "a5" - return None - - def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: # Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need # the "cube" arch; pure vector 
kernels can use the vector arch. @@ -365,14 +354,14 @@ def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: ) needs_cube = any(m in kernel_text for m in cube_markers) - arch = _normalize_pto_arch(pto_arch) + arch = (pto_arch or "").strip().lower() if arch == "a5": # A5 uses A5 instruction set. pto-isa examples build A5 kernels with # dav-c310-{vec|cube}. return "dav-c310-cube" if needs_cube else "dav-c310-vec" if arch == "a3": - # A2/A3 uses dav-c310 toolchain arch. - return "dav-c310-cube" if needs_cube else "dav-c310-vec" + # A2/A3 uses dav-c220 toolchain arch. + return "dav-c220-cube" if needs_cube else "dav-c220-vec" # Default to Ascend910 (dav-c220) when arch is unknown. return "dav-c220-cube" if needs_cube else "dav-c220-vec" @@ -847,9 +836,11 @@ def generate_testcase( # may be unavailable; build with a vector arch and explicitly enable the # section macros instead. if has_dav_cube or has_dav_vec: - arch = _normalize_pto_arch(pto_arch) - if arch == "a5" or arch == "a3": + arch = (pto_arch or "").strip().lower() + if arch == "a5": aicore_arch = "dav-c310-vec" + elif arch == "a3": + aicore_arch = "dav-c220-vec" else: aicore_arch = "dav-c220-vec" else: @@ -1203,10 +1194,10 @@ def generate_testcase( (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE vs - # REGISTER_BASE. A5 (e.g. Ascend950) and A2/A3 use REGISTER_BASE. + # REGISTER_BASE. A5 uses REGISTER_BASE. 
mem_base_define = "MEMORY_BASE" - arch = _normalize_pto_arch(pto_arch) - if arch == "a3" or arch == "a5": + arch = (pto_arch or "").strip().lower() + if arch == "a5": mem_base_define = "REGISTER_BASE" # CCE printing support is gated behind `--cce-enable-print` on some bisheng @@ -1398,7 +1389,7 @@ def generate_testcase( encoding="utf-8", ) - arch_for_runsh = _normalize_pto_arch(pto_arch) or (pto_arch or "Ascend910") + arch_for_runsh = pto_arch or "Ascend910" run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8") run_sh = run_sh.replace("@EXECUTABLE@", testcase) run_sh = run_sh.replace("@RUN_MODE@", run_mode) From 077962d9b4c1d4b3076aaeffaf4d7f519db756f8 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 20:50:45 +0800 Subject: [PATCH 04/14] Default pto-arch to a3 in testcase generator --- test/npu_validation/scripts/generate_testcase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 9bcadb60..3ee008f0 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -1405,7 +1405,7 @@ def main(): parser.add_argument("--testcase", default=None, help="Testcase name (default: derived from input filename)") parser.add_argument("--output-root", default=None, help="Output testcases root directory") parser.add_argument("--run-mode", default="npu", choices=["sim", "npu"], help="Run mode for run.sh") - parser.add_argument("--pto-arch", default=None, help="Target PTO arch (a3 or a5).") + parser.add_argument("--pto-arch", default="a3", help="Target PTO arch (a3 or a5).") parser.add_argument( "--aicore-arch", default=None, From e7d443eca362d2f045aaae8bf56965927d97d9bf Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 20:59:18 +0800 Subject: [PATCH 05/14] Map PTO_ARCH to a3/a5 SOC_VERSION in remote validation --- 
test/npu_validation/scripts/run_remote_npu_validation.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 30ca54b3..92d3122c 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -123,10 +123,10 @@ export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')" case "${pto_arch_lc}" in - a5) SOC_VERSION="Ascend910_9599" ;; - a3) SOC_VERSION="Ascend910B1" ;; + a5) SOC_VERSION="a5" ;; + a3) SOC_VERSION="a3" ;; *) - SOC_VERSION="Ascend910B1" + SOC_VERSION="a3" pto_arch_lc="a3" ;; esac From 2cee1e7f19221a90e3d92dfb628d29911657e161 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 21:03:53 +0800 Subject: [PATCH 06/14] Drop planmemory changes --- lib/PTO/Transforms/PTOPlanMemory.cpp | 163 +++++++------- lib/PTO/Transforms/PTOPlanMemory.h | 3 - .../plan_memory_bind_tile_alias_liveness.mlir | 35 --- .../plan_memory_for_iter_args_yield.mlir | 34 --- .../plan_memory_fragmentation_hole_fit.mlir | 151 ------------- .../plan_memory_fragmentation_two_holes.mlir | 162 -------------- test/basic/plan_memory_if_in_loop.mlir | 37 ---- test/basic/plan_memory_if_yield.mlir | 33 --- test/basic/plan_memory_loop_in_if.mlir | 37 ---- .../plan_memory_loop_no_reuse_outer_live.mlir | 40 ---- test/basic/plan_memory_nested_loops.mlir | 45 ---- test/basic/plan_memory_no_reuse_overlap.mlir | 30 --- .../basic/plan_memory_peak_8_overlapping.mlir | 59 ----- .../plan_memory_peak_exact_capacity.mlir | 141 ------------ test/basic/plan_memory_reuse_sequential.mlir | 202 ------------------ .../plan_memory_bind_tile_alias_liveness.py | 27 --- .../plan_memory_for_iter_args_yield.py | 32 --- .../plan_memory_fragmentation_hole_fit.py | 149 ------------- .../plan_memory_fragmentation_two_holes.py | 
157 -------------- .../planmemory/plan_memory_if_in_loop.py | 32 --- .../planmemory/plan_memory_if_yield.py | 29 --- .../planmemory/plan_memory_loop_in_if.py | 32 --- .../plan_memory_loop_no_reuse_outer_live.py | 36 ---- .../planmemory/plan_memory_nested_loops.py | 40 ---- .../plan_memory_no_reuse_overlap.py | 27 --- .../plan_memory_peak_8_overlapping.py | 57 ----- .../plan_memory_peak_exact_capacity.py | 139 ------------ .../plan_memory_reuse_sequential.py | 200 ----------------- 28 files changed, 75 insertions(+), 2054 deletions(-) delete mode 100644 test/basic/plan_memory_bind_tile_alias_liveness.mlir delete mode 100644 test/basic/plan_memory_for_iter_args_yield.mlir delete mode 100644 test/basic/plan_memory_fragmentation_hole_fit.mlir delete mode 100644 test/basic/plan_memory_fragmentation_two_holes.mlir delete mode 100644 test/basic/plan_memory_if_in_loop.mlir delete mode 100644 test/basic/plan_memory_if_yield.mlir delete mode 100644 test/basic/plan_memory_loop_in_if.mlir delete mode 100644 test/basic/plan_memory_loop_no_reuse_outer_live.mlir delete mode 100644 test/basic/plan_memory_nested_loops.mlir delete mode 100644 test/basic/plan_memory_no_reuse_overlap.mlir delete mode 100644 test/basic/plan_memory_peak_8_overlapping.mlir delete mode 100644 test/basic/plan_memory_peak_exact_capacity.mlir delete mode 100644 test/basic/plan_memory_reuse_sequential.mlir delete mode 100644 test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py delete mode 100644 test/samples/planmemory/plan_memory_for_iter_args_yield.py delete mode 100644 test/samples/planmemory/plan_memory_fragmentation_hole_fit.py delete mode 100644 test/samples/planmemory/plan_memory_fragmentation_two_holes.py delete mode 100644 test/samples/planmemory/plan_memory_if_in_loop.py delete mode 100644 test/samples/planmemory/plan_memory_if_yield.py delete mode 100644 test/samples/planmemory/plan_memory_loop_in_if.py delete mode 100644 test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py 
delete mode 100644 test/samples/planmemory/plan_memory_nested_loops.py delete mode 100644 test/samples/planmemory/plan_memory_no_reuse_overlap.py delete mode 100644 test/samples/planmemory/plan_memory_peak_8_overlapping.py delete mode 100644 test/samples/planmemory/plan_memory_peak_exact_capacity.py delete mode 100644 test/samples/planmemory/plan_memory_reuse_sequential.py diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp index 69569043..24d24171 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.cpp +++ b/lib/PTO/Transforms/PTOPlanMemory.cpp @@ -696,37 +696,6 @@ void MemPlan::EmitPlanMemoryFailureInfo() { } } -bool MemPlan::RecordOverflowIfAny() { - if (!failApplyBufferInfo.empty()) { - return true; - } - if (planMode != MemPlanMode::LOCAL_MEM_PLAN || - memscope2rootStorageEntry.empty()) { - return false; - } - - for (auto &it : memscope2rootStorageEntry) { - auto *rootStorageEntry = it.second; - if (!rootStorageEntry) { - continue; - } - auto bufferSpaceInfo = - GetBufferSpaceInfo(rootStorageEntry->bufInfo->bufferScope); - size_t maxBits = bufferSpaceInfo.second; - uint64_t maxAllocBits = rootStorageEntry->alignedConstBits; - for (auto *child : rootStorageEntry->mergedChildren) { - maxAllocBits = - std::max(maxAllocBits, child->bitsOffset + child->alignedConstBits); - } - if (maxAllocBits > maxBits) { - failApplyBufferInfo[rootStorageEntry->bufInfo->bufferScope] = - maxAllocBits; - } - } - - return !failApplyBufferInfo.empty(); -} - // Plan Memory algorithm. LogicalResult MemPlan::plan() { // Construct StorageEntry structure. @@ -739,10 +708,6 @@ LogicalResult MemPlan::plan() { EmitPlanMemoryFailureInfo(); return failure(); } - if (RecordOverflowIfAny()) { - EmitPlanMemoryFailureInfo(); - return failure(); - } // Update the address information of each buffer after memory buffer. 
UpdateBuffer2Offsets(); if (enablePrintMemoryAllocatedSize) { @@ -1788,66 +1753,88 @@ void MemPlan::ReportAllocatedEntryDebugInfo(StorageEntry *rootStorageEntry) { } LogicalResult MemPlan::InitMemSpecsFromModule(func::FuncOp funcOp) { - struct MemSpec { - int ubSpaceSize; - int l1SpaceSize; - int l0aSpaceSize; - int l0bSpaceSize; - int l0cSpaceSize; - int ubAlignSize; - int l1AlignSize; - int l0cAlignSize; - int l0aAlignSize; - int l0bAlignSize; - int biasAlignSize; - int biasSpaceSize; - int scalingAlignSize; - int scalingSpaceSize; - }; - - const MemSpec kA3 = { - 1572864, 4194304, 524288, 524288, 1048576, 256, 256, - 4096, 4096, 4096, 256, 524288, 256, 1572864}; - const MemSpec kA5 = { - 2031616, 4194304, 524288, 524288, 2097152, 256, 256, - 4096, 4096, 4096, 256, 524288, 256, 2031616}; - - auto applySpec = [this](const MemSpec &spec) { - ubSpaceSize = spec.ubSpaceSize; - l1SpaceSize = spec.l1SpaceSize; - l0aSpaceSize = spec.l0aSpaceSize; - l0bSpaceSize = spec.l0bSpaceSize; - l0cSpaceSize = spec.l0cSpaceSize; - ubAlignSize = spec.ubAlignSize; - l1AlignSize = spec.l1AlignSize; - l0cAlignSize = spec.l0cAlignSize; - l0aAlignSize = spec.l0aAlignSize; - l0bAlignSize = spec.l0bAlignSize; - biasAlignSize = spec.biasAlignSize; - biasSpaceSize = spec.biasSpaceSize; - scalingAlignSize = spec.scalingAlignSize; - scalingSpaceSize = spec.scalingSpaceSize; - }; - - // Default to a3. 
- applySpec(kA3); + ubSpaceSize = 1572864; + l1SpaceSize = 4194304; + l0aSpaceSize = 524288; + l0bSpaceSize = 524288; + l0cSpaceSize = 1048576; + ubAlignSize = 256; + l1AlignSize = 256; + l0cAlignSize = 4096; + l0aAlignSize = 4096; + l0bAlignSize = 4096; + biasAlignSize = 256; + biasSpaceSize = 524288; + scalingAlignSize = 256; + scalingSpaceSize = 1572864; auto moduleOp = getTopLevelModuleOp(funcOp); - StringAttr archAttr = moduleOp->getAttrOfType("pto.target_arch"); - if (!archAttr) { + StringAttr strAttr = moduleOp->getAttrOfType("pto.device-spec"); + if (!strAttr) { return success(); } - std::string arch = archAttr.getValue().str(); - for (char &c : arch) - c = static_cast(std::tolower(static_cast(c))); + if (strAttr.getValue().str() == "Ascend910B1" || + strAttr.getValue().str() == "Ascend910B2" || + strAttr.getValue().str() == "Ascend910B3" || + strAttr.getValue().str() == "Ascend910B4" || + strAttr.getValue().str() == "Ascend910_9362" || + strAttr.getValue().str() == "Ascend910_9372" || + strAttr.getValue().str() == "Ascend910_9381" || + strAttr.getValue().str() == "Ascend910_9382" || + strAttr.getValue().str() == "Ascend910_9391" || + strAttr.getValue().str() == "Ascend910_9392") { + return success(); + } - // --pto-arch options: - // a3 -> default memory spec - // a5 -> override memory spec - if (arch == "a5") { - applySpec(kA5); + if (strAttr.getValue().str() == "Ascend310B1" || + strAttr.getValue().str() == "Ascend310B2" || + strAttr.getValue().str() == "Ascend310B3" || + strAttr.getValue().str() == "Ascend310B4") { + ubSpaceSize = 2097152; + l1SpaceSize = 8388608; + l0aSpaceSize = 524288; + l0bSpaceSize = 524288; + l0cSpaceSize = 1048576; + ubAlignSize = 256; + l1AlignSize = 256; + l0cAlignSize = 4096; + l0aAlignSize = 4096; + l0bAlignSize = 4096; + biasAlignSize = 256; + biasSpaceSize = 524288; + scalingAlignSize = 256; + scalingSpaceSize = 2097152; + return success(); } + + if (strAttr.getValue().str() == "Ascend910_950z" || + strAttr.getValue().str() 
== "Ascend910_9579" || + strAttr.getValue().str() == "Ascend910_957b" || + strAttr.getValue().str() == "Ascend910_957d" || + strAttr.getValue().str() == "Ascend910_950z" || + strAttr.getValue().str() == "Ascend910_9581" || + strAttr.getValue().str() == "Ascend910_9589" || + strAttr.getValue().str() == "Ascend910_958a" || + strAttr.getValue().str() == "Ascend910_958b" || + strAttr.getValue().str() == "Ascend910_9599") { + ubSpaceSize = 2031616; + l1SpaceSize = 4194304; + l0aSpaceSize = 524288; + l0bSpaceSize = 524288; + l0cSpaceSize = 2097152; + ubAlignSize = 256; + l1AlignSize = 256; + l0cAlignSize = 4096; + l0aAlignSize = 4096; + l0bAlignSize = 4096; + biasAlignSize = 256; + biasSpaceSize = 524288; + scalingAlignSize = 256; + scalingSpaceSize = 2031616; + return success(); + } + return success(); } diff --git a/lib/PTO/Transforms/PTOPlanMemory.h b/lib/PTO/Transforms/PTOPlanMemory.h index 6089087c..e7cff4f7 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.h +++ b/lib/PTO/Transforms/PTOPlanMemory.h @@ -466,9 +466,6 @@ class MemPlan { /// Print successful memory alloc. void PrintSuccessfulAllocatedMaxBits(); - /// Post-plan sanity check for local memory overflow. - bool RecordOverflowIfAny(); - /// Prepare the memref.alloc plan. 
PlanStatus PlanLocalMemAddress(); diff --git a/test/basic/plan_memory_bind_tile_alias_liveness.mlir b/test/basic/plan_memory_bind_tile_alias_liveness.mlir deleted file mode 100644 index d58be111..00000000 --- a/test/basic/plan_memory_bind_tile_alias_liveness.mlir +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @bind_tile_alias_liveness(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %c16 = arith.constant 16 : index - - %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %av = pto.bind_tile %a, %c16, %c16 - {config = #pto.tile_buf_config} - : memref<16x16x16xf16, #pto.address_space> -> memref<16x16x16xf16, #pto.address_space> - - %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - // Using %av should keep %a live; %b must not reuse %a's offset. - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%av : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%av : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! 
-// CHECK: func.func @bind_tile_alias_liveness -// CHECK-NOT: memref.alloc -// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 -// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> - diff --git a/test/basic/plan_memory_for_iter_args_yield.mlir b/test/basic/plan_memory_for_iter_args_yield.mlir deleted file mode 100644 index c6254e46..00000000 --- a/test/basic/plan_memory_for_iter_args_yield.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @for_iter_args_yield(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - scf.for %i = %c0 to %c2 step %c1 { - // Two allocs inside the loop body exercise liveness within loops and - // per-iteration memory planning for overlapping lifetimes. - %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%a : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @for_iter_args_yield -// CHECK-NOT: memref.alloc -// CHECK: scf.for -// After matching the loop header, require at least one planned buffer inside -// the loop body (the inner alloc becomes a pointer_cast). 
-// CHECK: pto.pointer_cast -// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_fragmentation_hole_fit.mlir b/test/basic/plan_memory_fragmentation_hole_fit.mlir deleted file mode 100644 index 7b9a5690..00000000 --- a/test/basic/plan_memory_fragmentation_hole_fit.mlir +++ /dev/null @@ -1,151 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @fragmentation_hole_fit(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - // Force a tight situation: - // - Keep 23 UB buffers live across the function (23 * 8192 = 188416 B). - // - Only 1 UB slot remains. Two short-lived buffers must reuse that slot. - %k0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k19 = memref.alloc() : 
memref<16x16x16xf16, #pto.address_space> - %k20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - // Touch all k-buffers early so their lifetimes start before the temps. - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k1 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k2 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k3 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k4 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k5 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k6 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k7 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k8 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k9 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k10 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k11 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k12 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k13 : memref<16x16x16xf16, 
#pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k14 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k15 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k16 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k17 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k18 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k19 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k20 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k21 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k22 : memref<16x16x16xf16, #pto.address_space>) - - %t0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%t0 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%t0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %t1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%t1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%t1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - // Keep all k-buffers live until the end. 
- pto.tstore ins(%k0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k2 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k3 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k4 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k5 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k6 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k7 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k8 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k9 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k10 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k11 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k12 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k13 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k14 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k15 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k16 : 
memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k17 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k18 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k19 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k20 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k21 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k22 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @fragmentation_hole_fit -// CHECK-NOT: memref.alloc -// With 23 live UB buffers, there is exactly one remaining 8192B slot. The two -// short-lived buffers must reuse that slot (offset 23*8192 = 188416). -// CHECK-DAG: %[[O188416:.*]] = arith.constant 188416 : i64 -// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_fragmentation_two_holes.mlir b/test/basic/plan_memory_fragmentation_two_holes.mlir deleted file mode 100644 index f1a817d9..00000000 --- a/test/basic/plan_memory_fragmentation_two_holes.mlir +++ /dev/null @@ -1,162 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @fragmentation_two_holes(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - // Tight + overlap: - // - Keep 22 UB buffers live (22 * 8192 = 180224 B), leaving 2 free slots. 
- // - Allocate 2 short-lived buffers with overlapping lifetimes twice. - // The two free slots are at offsets 22*8192 and 23*8192. - %k0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %k21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - // Touch all k-buffers early so their lifetimes start before the temps. 
- pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k1 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k2 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k3 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k4 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k5 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k6 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k7 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k8 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k9 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k10 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k11 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k12 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k13 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k14 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k15 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, 
#pto.address_space>) - outs(%k16 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k17 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k18 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k19 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k20 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%k21 : memref<16x16x16xf16, #pto.address_space>) - - // Stage 1: two overlapping temps. - %a0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %b0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%a0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b0 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%a0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - // Stage 2: two overlapping temps again, which should reuse the same two slots. 
- %a1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %b1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%a1 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%a1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - // Keep all k-buffers live until the end. - pto.tstore ins(%k0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k2 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k3 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k4 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k5 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k6 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k7 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k8 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k9 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k10 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, 
#pto.address_space>) - pto.tstore ins(%k11 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k12 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k13 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k14 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k15 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k16 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k17 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k18 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k19 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k20 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%k21 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @fragmentation_two_holes -// CHECK-NOT: memref.alloc -// With 22 live UB buffers, there are exactly two remaining 8192B slots at -// offsets 22*8192 and 23*8192, reused across the two stages. 
-// CHECK-DAG: %[[O180224:.*]] = arith.constant 180224 : i64 -// CHECK-DAG: %[[O188416:.*]] = arith.constant 188416 : i64 -// CHECK-DAG: pto.pointer_cast(%[[O180224]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%[[O180224]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_if_in_loop.mlir b/test/basic/plan_memory_if_in_loop.mlir deleted file mode 100644 index accc16a1..00000000 --- a/test/basic/plan_memory_if_in_loop.mlir +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @if_in_loop(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - scf.for %i = %c0 to %c2 step %c1 { - %is0 = arith.cmpi eq, %i, %c0 : index - scf.if %is0 { - %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%a : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } else { - %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - } - return - } -} - -// CHECK: end PTO plan Mem! 
-// CHECK: func.func @if_in_loop -// CHECK-NOT: memref.alloc -// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 -// CHECK: scf.for -// CHECK: scf.if -// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_if_yield.mlir b/test/basic/plan_memory_if_yield.mlir deleted file mode 100644 index bd9b48b2..00000000 --- a/test/basic/plan_memory_if_yield.mlir +++ /dev/null @@ -1,33 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @if_yield(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %cond = arith.constant true - // Use scf.if control-flow without yielding a memref (the current emitc - // lowering can't handle memref-typed scf.if results), but still ensure - // PlanMemory rewrites allocs inside both branches. - scf.if %cond { - %then = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%then : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%then : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } else { - %els = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%els : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%els : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - return - } -} - -// CHECK: end PTO plan Mem! 
-// CHECK: func.func @if_yield -// CHECK-NOT: memref.alloc -// CHECK: scf.if -// CHECK: pto.pointer_cast -// CHECK: } else { -// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_loop_in_if.mlir b/test/basic/plan_memory_loop_in_if.mlir deleted file mode 100644 index 31d2a970..00000000 --- a/test/basic/plan_memory_loop_in_if.mlir +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @loop_in_if(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %true = arith.constant true - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - scf.if %true { - scf.for %i = %c0 to %c2 step %c1 { - %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%a : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - } else { - %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%b : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - return - } -} - -// CHECK: end PTO plan Mem! 
-// CHECK: func.func @loop_in_if -// CHECK-NOT: memref.alloc -// CHECK: scf.if -// CHECK: scf.for -// CHECK: } else { -// CHECK: pto.pointer_cast - diff --git a/test/basic/plan_memory_loop_no_reuse_outer_live.mlir b/test/basic/plan_memory_loop_no_reuse_outer_live.mlir deleted file mode 100644 index f08bc9f5..00000000 --- a/test/basic/plan_memory_loop_no_reuse_outer_live.mlir +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @loop_outer_live(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - - // A buffer that remains live across the loop (used after the loop). - %outer = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%outer : memref<16x16x16xf16, #pto.address_space>) - - // A loop-local buffer used inside the loop. - %inner = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - scf.for %i = %c0 to %c4 step %c1 { - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%inner : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%inner : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - - // Use %outer after the loop to keep it live across the loop. - pto.tstore ins(%outer : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @loop_outer_live -// CHECK-NOT: memref.alloc -// Expect a loop, and two planned buffers at distinct offsets. 
-// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 -// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK: scf.for diff --git a/test/basic/plan_memory_nested_loops.mlir b/test/basic/plan_memory_nested_loops.mlir deleted file mode 100644 index 0301039a..00000000 --- a/test/basic/plan_memory_nested_loops.mlir +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @nested_loops(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - // Outer buffer stays live across both loops (used after). - %outer = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%outer : memref<16x16x16xf16, #pto.address_space>) - - scf.for %i = %c0 to %c2 step %c1 { - // Buffer allocated inside the outer loop. - %mid = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%mid : memref<16x16x16xf16, #pto.address_space>) - scf.for %j = %c0 to %c2 step %c1 { - // A temp buffer in the inner loop. 
- %tmp = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%tmp : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%tmp : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - pto.tstore ins(%mid : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - } - - pto.tstore ins(%outer : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @nested_loops -// CHECK-NOT: memref.alloc -// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 -// CHECK: scf.for -// CHECK: scf.for -// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_no_reuse_overlap.mlir b/test/basic/plan_memory_no_reuse_overlap.mlir deleted file mode 100644 index b7a6e0d9..00000000 --- a/test/basic/plan_memory_no_reuse_overlap.mlir +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @no_reuse_overlap(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %ub0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %ub1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - // Make lifetimes overlap by using both buffers after both are created. 
- pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @no_reuse_overlap -// CHECK-NOT: memref.alloc -// With overlapping lifetimes, offsets must differ. -// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 -// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_peak_8_overlapping.mlir b/test/basic/plan_memory_peak_8_overlapping.mlir deleted file mode 100644 index b22b54fb..00000000 --- a/test/basic/plan_memory_peak_8_overlapping.mlir +++ /dev/null @@ -1,59 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @peak_8_overlapping(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - // Peak liveness: 8 buffers live at once. 
- %u0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u1 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u2 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u3 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u4 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u5 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u6 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u7 : memref<16x16x16xf16, #pto.address_space>) - - pto.tstore ins(%u0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u2 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u3 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - 
pto.tstore ins(%u4 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u5 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u6 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u7 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @peak_8_overlapping -// CHECK-NOT: memref.alloc -// 8 live buffers implies a max offset of 7*8192 = 57344 bytes. -// CHECK: %[[O57344:.*]] = arith.constant 57344 : i64 -// CHECK: pto.pointer_cast(%[[O57344]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> - diff --git a/test/basic/plan_memory_peak_exact_capacity.mlir b/test/basic/plan_memory_peak_exact_capacity.mlir deleted file mode 100644 index eaef6d40..00000000 --- a/test/basic/plan_memory_peak_exact_capacity.mlir +++ /dev/null @@ -1,141 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @peak_exact_capacity(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - // Default UB size is 1572864 bits (196608 bytes). Each buffer here is - // 16*16*16*f16 = 8192 bytes. 24 buffers live at once should fit exactly: - // 24 * 8192 = 196608 bytes. 
- %u0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %u23 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u0 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u1 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u2 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : 
memref<16x16x16xf16, #pto.address_space>) - outs(%u3 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u4 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u5 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u6 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u7 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u8 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u9 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u10 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u11 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u12 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u13 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u14 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u15 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u16 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u17 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u18 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u19 
: memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u20 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u21 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u22 : memref<16x16x16xf16, #pto.address_space>) - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%u23 : memref<16x16x16xf16, #pto.address_space>) - - pto.tstore ins(%u0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u2 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u3 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u4 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u5 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u6 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u7 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u8 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u9 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u10 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u11 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) 
- pto.tstore ins(%u12 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u13 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u14 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u15 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u16 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u17 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u18 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u19 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u20 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u21 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u22 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%u23 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @peak_exact_capacity -// CHECK-NOT: memref.alloc -// 24 live buffers implies a max offset of 23*8192 = 188416 bytes. 
-// CHECK: %[[O188416:.*]] = arith.constant 188416 : i64 -// CHECK: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> - diff --git a/test/basic/plan_memory_reuse_sequential.mlir b/test/basic/plan_memory_reuse_sequential.mlir deleted file mode 100644 index 20ff62ab..00000000 --- a/test/basic/plan_memory_reuse_sequential.mlir +++ /dev/null @@ -1,202 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @reuse_sequential(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - // Force reuse: - // UB capacity (default) is 1572864 bits (196608 bytes). Each buffer here is - // 16*16*16*f16 = 8192 bytes. Allocating 30 such buffers exceeds UB capacity - // unless memory reuse is applied. - %ub0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub0 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub0 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub2 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub2 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub3 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub3 : memref<16x16x16xf16, #pto.address_space>) - 
outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub4 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub4 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub5 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub5 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub6 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub6 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub7 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub7 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub8 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub8 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub9 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub9 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub10 = memref.alloc() : memref<16x16x16xf16, 
#pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub10 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub10 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub11 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub11 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub12 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub12 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub13 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub13 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub14 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub14 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub15 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub15 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - 
outs(%ub16 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub16 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub17 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub17 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub18 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub18 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub19 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub19 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub20 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub20 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub21 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub21 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub22 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub22 : 
memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub23 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub23 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub23 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub24 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub24 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub24 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub25 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub25 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub25 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub26 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub26 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub26 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub27 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub27 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub27 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - %ub28 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub28 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub28 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, 
#pto.address_space>) - - %ub29 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub29 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub29 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - return - } -} - -// Anchor checks after the PlanMemory debug marker (ptoas prints the module -// before and after planning). -// CHECK: end PTO plan Mem! -// CHECK: func.func @reuse_sequential -// CHECK-NOT: memref.alloc -// Expect at least two distinct allocations to reuse offset 0. -// CHECK: %c0_i64 = arith.constant 0 : i64 -// CHECK: %[[BUF0:.*]] = pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK: %[[BUF1:.*]] = pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py b/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py deleted file mode 100644 index fd92e85a..00000000 --- a/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py +++ /dev/null @@ -1,27 +0,0 @@ -PTO_IR = r""" - -module { - func.func @bind_tile_alias_liveness(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %c16 = arith.constant 16 : index - - %a = pto.alloc_tile : !pto.tile_buf - - %b = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b : !pto.tile_buf) - pto.tstore ins(%b : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - // Using %a should keep %a live; %b must not reuse %a's offset. 
- pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a : !pto.tile_buf) - pto.tstore ins(%a : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_for_iter_args_yield.py b/test/samples/planmemory/plan_memory_for_iter_args_yield.py deleted file mode 100644 index f4d9d7f7..00000000 --- a/test/samples/planmemory/plan_memory_for_iter_args_yield.py +++ /dev/null @@ -1,32 +0,0 @@ -PTO_IR = r""" - -module { - func.func @for_iter_args_yield(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - scf.for %i = %c0 to %c2 step %c1 { - // Two allocs inside the loop body exercise liveness within loops and - // per-iteration memory planning for overlapping lifetimes. - %a = pto.alloc_tile : !pto.tile_buf - %b = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b : !pto.tile_buf) - pto.tstore ins(%a : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%b : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - return - } -} - -// After matching the loop header, require at least one planned buffer inside -// the loop body (the inner alloc becomes a pointer_cast). 
-""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py b/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py deleted file mode 100644 index 1447ed2f..00000000 --- a/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py +++ /dev/null @@ -1,149 +0,0 @@ -PTO_IR = r""" - -module { - func.func @fragmentation_hole_fit(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - // Force a tight situation: - // - Keep 23 UB buffers live across the function (23 * 8192 = 188416 B). - // - Only 1 UB slot remains. Two short-lived buffers must reuse that slot. - %k0 = pto.alloc_tile : !pto.tile_buf - %k1 = pto.alloc_tile : !pto.tile_buf - %k2 = pto.alloc_tile : !pto.tile_buf - %k3 = pto.alloc_tile : !pto.tile_buf - %k4 = pto.alloc_tile : !pto.tile_buf - %k5 = pto.alloc_tile : !pto.tile_buf - %k6 = pto.alloc_tile : !pto.tile_buf - %k7 = pto.alloc_tile : !pto.tile_buf - %k8 = pto.alloc_tile : !pto.tile_buf - %k9 = pto.alloc_tile : !pto.tile_buf - %k10 = pto.alloc_tile : !pto.tile_buf - %k11 = pto.alloc_tile : !pto.tile_buf - %k12 = pto.alloc_tile : !pto.tile_buf - %k13 = pto.alloc_tile : !pto.tile_buf - %k14 = pto.alloc_tile : !pto.tile_buf - %k15 = pto.alloc_tile : !pto.tile_buf - %k16 = pto.alloc_tile : !pto.tile_buf - %k17 = pto.alloc_tile : !pto.tile_buf - %k18 = pto.alloc_tile : !pto.tile_buf - %k19 = pto.alloc_tile : !pto.tile_buf - %k20 = pto.alloc_tile : !pto.tile_buf - %k21 = pto.alloc_tile : !pto.tile_buf - %k22 = pto.alloc_tile : !pto.tile_buf - - // Touch all k-buffers early so their lifetimes start before the temps. 
- pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k1 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k2 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k3 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k4 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k5 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k6 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k7 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k8 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k9 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k10 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k11 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k12 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k13 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k14 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k15 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k16 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k17 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k18 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k19 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k20 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, 
#pto.address_space>) - outs(%k21 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k22 : !pto.tile_buf) - - %t0 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%t0 : !pto.tile_buf) - pto.tstore ins(%t0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %t1 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%t1 : !pto.tile_buf) - pto.tstore ins(%t1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - // Keep all k-buffers live until the end. - pto.tstore ins(%k0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k2 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k3 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k4 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k5 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k6 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k7 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k8 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k9 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k10 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k11 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k12 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k13 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k14 : !pto.tile_buf) - outs(%arg1 
: memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k15 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k16 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k17 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k18 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k19 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k20 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k21 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k22 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} - -// With 23 live UB buffers, there is exactly one remaining 8192B slot. The two -// short-lived buffers must reuse that slot (offset 23*8192 = 188416). -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_fragmentation_two_holes.py b/test/samples/planmemory/plan_memory_fragmentation_two_holes.py deleted file mode 100644 index cf360b0a..00000000 --- a/test/samples/planmemory/plan_memory_fragmentation_two_holes.py +++ /dev/null @@ -1,157 +0,0 @@ -PTO_IR = r""" - -module { - func.func @fragmentation_two_holes(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - // Tight + overlap: - // - Keep 22 UB buffers live (22 * 8192 = 180224 B), leaving 2 free slots. - // - Allocate 2 short-lived buffers with overlapping lifetimes twice. - // The two free slots are at offsets 22*8192 and 23*8192. 
- %k0 = pto.alloc_tile : !pto.tile_buf - %k1 = pto.alloc_tile : !pto.tile_buf - %k2 = pto.alloc_tile : !pto.tile_buf - %k3 = pto.alloc_tile : !pto.tile_buf - %k4 = pto.alloc_tile : !pto.tile_buf - %k5 = pto.alloc_tile : !pto.tile_buf - %k6 = pto.alloc_tile : !pto.tile_buf - %k7 = pto.alloc_tile : !pto.tile_buf - %k8 = pto.alloc_tile : !pto.tile_buf - %k9 = pto.alloc_tile : !pto.tile_buf - %k10 = pto.alloc_tile : !pto.tile_buf - %k11 = pto.alloc_tile : !pto.tile_buf - %k12 = pto.alloc_tile : !pto.tile_buf - %k13 = pto.alloc_tile : !pto.tile_buf - %k14 = pto.alloc_tile : !pto.tile_buf - %k15 = pto.alloc_tile : !pto.tile_buf - %k16 = pto.alloc_tile : !pto.tile_buf - %k17 = pto.alloc_tile : !pto.tile_buf - %k18 = pto.alloc_tile : !pto.tile_buf - %k19 = pto.alloc_tile : !pto.tile_buf - %k20 = pto.alloc_tile : !pto.tile_buf - %k21 = pto.alloc_tile : !pto.tile_buf - - // Touch all k-buffers early so their lifetimes start before the temps. - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k1 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k2 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k3 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k4 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k5 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k6 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k7 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k8 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k9 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k10 : !pto.tile_buf) - pto.tload ins(%arg0 : 
memref<16x256xf16, #pto.address_space>) - outs(%k11 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k12 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k13 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k14 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k15 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k16 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k17 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k18 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k19 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k20 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%k21 : !pto.tile_buf) - - // Stage 1: two overlapping temps. - %a0 = pto.alloc_tile : !pto.tile_buf - %b0 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b0 : !pto.tile_buf) - pto.tstore ins(%a0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%b0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - // Stage 2: two overlapping temps again, which should reuse the same two slots. 
- %a1 = pto.alloc_tile : !pto.tile_buf - %b1 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a1 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b1 : !pto.tile_buf) - pto.tstore ins(%a1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%b1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - // Keep all k-buffers live until the end. - pto.tstore ins(%k0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k2 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k3 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k4 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k5 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k6 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k7 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k8 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k9 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k10 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k11 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k12 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k13 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k14 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k15 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - 
pto.tstore ins(%k16 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k17 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k18 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k19 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k20 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%k21 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} - -// With 22 live UB buffers, there are exactly two remaining 8192B slots at -// offsets 22*8192 and 23*8192, reused across the two stages. -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_if_in_loop.py b/test/samples/planmemory/plan_memory_if_in_loop.py deleted file mode 100644 index dccdbe2d..00000000 --- a/test/samples/planmemory/plan_memory_if_in_loop.py +++ /dev/null @@ -1,32 +0,0 @@ -PTO_IR = r""" - -module { - func.func @if_in_loop(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - scf.for %i = %c0 to %c2 step %c1 { - %is0 = arith.cmpi eq, %i, %c0 : index - scf.if %is0 { - %a = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a : !pto.tile_buf) - pto.tstore ins(%a : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } else { - %b = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b : !pto.tile_buf) - pto.tstore ins(%b : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - } - return - } -} -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_if_yield.py 
b/test/samples/planmemory/plan_memory_if_yield.py deleted file mode 100644 index 3a1bbb8d..00000000 --- a/test/samples/planmemory/plan_memory_if_yield.py +++ /dev/null @@ -1,29 +0,0 @@ -PTO_IR = r""" - -module { - func.func @if_yield(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %cond = arith.constant true - // Use scf.if control-flow without yielding a memref (the current emitc - // lowering can't handle memref-typed scf.if results), but still ensure - // PlanMemory rewrites allocs inside both branches. - scf.if %cond { - %then = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%then : !pto.tile_buf) - pto.tstore ins(%then : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } else { - %els = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%els : !pto.tile_buf) - pto.tstore ins(%els : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - return - } -} -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_loop_in_if.py b/test/samples/planmemory/plan_memory_loop_in_if.py deleted file mode 100644 index 1b4a4602..00000000 --- a/test/samples/planmemory/plan_memory_loop_in_if.py +++ /dev/null @@ -1,32 +0,0 @@ -PTO_IR = r""" - -module { - func.func @loop_in_if(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %true = arith.constant true - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - scf.if %true { - scf.for %i = %c0 to %c2 step %c1 { - %a = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%a : !pto.tile_buf) - pto.tstore ins(%a : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - } else { - %b = pto.alloc_tile : !pto.tile_buf - pto.tload 
ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%b : !pto.tile_buf) - pto.tstore ins(%b : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - return - } -} -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py b/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py deleted file mode 100644 index 76207643..00000000 --- a/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py +++ /dev/null @@ -1,36 +0,0 @@ -PTO_IR = r""" - -module { - func.func @loop_outer_live(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - - // A buffer that remains live across the loop (used after the loop). - %outer = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%outer : !pto.tile_buf) - - // A loop-local buffer used inside the loop. - %inner = pto.alloc_tile : !pto.tile_buf - scf.for %i = %c0 to %c4 step %c1 { - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%inner : !pto.tile_buf) - pto.tstore ins(%inner : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - - // Use %outer after the loop to keep it live across the loop. - pto.tstore ins(%outer : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - return - } -} - -// Expect a loop, and two planned buffers at distinct offsets. 
-""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_nested_loops.py b/test/samples/planmemory/plan_memory_nested_loops.py deleted file mode 100644 index 574c7019..00000000 --- a/test/samples/planmemory/plan_memory_nested_loops.py +++ /dev/null @@ -1,40 +0,0 @@ -PTO_IR = r""" - -module { - func.func @nested_loops(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - - // Outer buffer stays live across both loops (used after). - %outer = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%outer : !pto.tile_buf) - - scf.for %i = %c0 to %c2 step %c1 { - // Buffer allocated inside the outer loop. - %mid = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%mid : !pto.tile_buf) - scf.for %j = %c0 to %c2 step %c1 { - // A temp buffer in the inner loop. 
- %tmp = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%tmp : !pto.tile_buf) - pto.tstore ins(%tmp : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - pto.tstore ins(%mid : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - } - - pto.tstore ins(%outer : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_no_reuse_overlap.py b/test/samples/planmemory/plan_memory_no_reuse_overlap.py deleted file mode 100644 index 2bcabe67..00000000 --- a/test/samples/planmemory/plan_memory_no_reuse_overlap.py +++ /dev/null @@ -1,27 +0,0 @@ -PTO_IR = r""" - -module { - func.func @no_reuse_overlap(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %ub0 = pto.alloc_tile : !pto.tile_buf - %ub1 = pto.alloc_tile : !pto.tile_buf - - // Make lifetimes overlap by using both buffers after both are created. - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub1 : !pto.tile_buf) - pto.tstore ins(%ub0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%ub1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - return - } -} - -// With overlapping lifetimes, offsets must differ. 
-""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_peak_8_overlapping.py b/test/samples/planmemory/plan_memory_peak_8_overlapping.py deleted file mode 100644 index 863aa4c0..00000000 --- a/test/samples/planmemory/plan_memory_peak_8_overlapping.py +++ /dev/null @@ -1,57 +0,0 @@ -PTO_IR = r""" - -module { - func.func @peak_8_overlapping(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - // Peak liveness: 8 buffers live at once. - %u0 = pto.alloc_tile : !pto.tile_buf - %u1 = pto.alloc_tile : !pto.tile_buf - %u2 = pto.alloc_tile : !pto.tile_buf - %u3 = pto.alloc_tile : !pto.tile_buf - %u4 = pto.alloc_tile : !pto.tile_buf - %u5 = pto.alloc_tile : !pto.tile_buf - %u6 = pto.alloc_tile : !pto.tile_buf - %u7 = pto.alloc_tile : !pto.tile_buf - - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u1 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u2 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u3 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u4 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u5 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u6 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u7 : !pto.tile_buf) - - pto.tstore ins(%u0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u2 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u3 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u4 : !pto.tile_buf) - 
outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u5 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u6 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u7 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} - -// 8 live buffers implies a max offset of 7*8192 = 57344 bytes. -""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_peak_exact_capacity.py b/test/samples/planmemory/plan_memory_peak_exact_capacity.py deleted file mode 100644 index d8e260f7..00000000 --- a/test/samples/planmemory/plan_memory_peak_exact_capacity.py +++ /dev/null @@ -1,139 +0,0 @@ -PTO_IR = r""" - -module { - func.func @peak_exact_capacity(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - // Default UB size is 1572864 bits (196608 bytes). Each buffer here is - // 16*16*16*f16 = 8192 bytes. 24 buffers live at once should fit exactly: - // 24 * 8192 = 196608 bytes. 
- %u0 = pto.alloc_tile : !pto.tile_buf - %u1 = pto.alloc_tile : !pto.tile_buf - %u2 = pto.alloc_tile : !pto.tile_buf - %u3 = pto.alloc_tile : !pto.tile_buf - %u4 = pto.alloc_tile : !pto.tile_buf - %u5 = pto.alloc_tile : !pto.tile_buf - %u6 = pto.alloc_tile : !pto.tile_buf - %u7 = pto.alloc_tile : !pto.tile_buf - %u8 = pto.alloc_tile : !pto.tile_buf - %u9 = pto.alloc_tile : !pto.tile_buf - %u10 = pto.alloc_tile : !pto.tile_buf - %u11 = pto.alloc_tile : !pto.tile_buf - %u12 = pto.alloc_tile : !pto.tile_buf - %u13 = pto.alloc_tile : !pto.tile_buf - %u14 = pto.alloc_tile : !pto.tile_buf - %u15 = pto.alloc_tile : !pto.tile_buf - %u16 = pto.alloc_tile : !pto.tile_buf - %u17 = pto.alloc_tile : !pto.tile_buf - %u18 = pto.alloc_tile : !pto.tile_buf - %u19 = pto.alloc_tile : !pto.tile_buf - %u20 = pto.alloc_tile : !pto.tile_buf - %u21 = pto.alloc_tile : !pto.tile_buf - %u22 = pto.alloc_tile : !pto.tile_buf - %u23 = pto.alloc_tile : !pto.tile_buf - - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u0 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u1 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u2 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u3 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u4 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u5 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u6 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u7 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u8 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u9 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u10 : !pto.tile_buf) - pto.tload ins(%arg0 : 
memref<16x256xf16, #pto.address_space>) - outs(%u11 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u12 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u13 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u14 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u15 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u16 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u17 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u18 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u19 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u20 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u21 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u22 : !pto.tile_buf) - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%u23 : !pto.tile_buf) - - pto.tstore ins(%u0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u2 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u3 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u4 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u5 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u6 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u7 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u8 : !pto.tile_buf) - outs(%arg1 : 
memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u9 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u10 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u11 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u12 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u13 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u14 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u15 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u16 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u17 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u18 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u19 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u20 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u21 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u22 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - pto.tstore ins(%u23 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} - -// 24 live buffers implies a max offset of 23*8192 = 188416 bytes. 
-""" - -if __name__ == "__main__": - print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_reuse_sequential.py b/test/samples/planmemory/plan_memory_reuse_sequential.py deleted file mode 100644 index 1114f851..00000000 --- a/test/samples/planmemory/plan_memory_reuse_sequential.py +++ /dev/null @@ -1,200 +0,0 @@ -PTO_IR = r""" - -module { - func.func @reuse_sequential(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - // Force reuse: - // UB capacity (default) is 1572864 bits (196608 bytes). Each buffer here is - // 16*16*16*f16 = 8192 bytes. Allocating 30 such buffers exceeds UB capacity - // unless memory reuse is applied. - %ub0 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub0 : !pto.tile_buf) - pto.tstore ins(%ub0 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub1 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub1 : !pto.tile_buf) - pto.tstore ins(%ub1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub2 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub2 : !pto.tile_buf) - pto.tstore ins(%ub2 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub3 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub3 : !pto.tile_buf) - pto.tstore ins(%ub3 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub4 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub4 : !pto.tile_buf) - pto.tstore ins(%ub4 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub5 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub5 : !pto.tile_buf) - pto.tstore ins(%ub5 : 
!pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub6 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub6 : !pto.tile_buf) - pto.tstore ins(%ub6 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub7 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub7 : !pto.tile_buf) - pto.tstore ins(%ub7 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub8 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub8 : !pto.tile_buf) - pto.tstore ins(%ub8 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub9 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub9 : !pto.tile_buf) - pto.tstore ins(%ub9 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub10 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub10 : !pto.tile_buf) - pto.tstore ins(%ub10 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub11 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub11 : !pto.tile_buf) - pto.tstore ins(%ub11 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub12 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub12 : !pto.tile_buf) - pto.tstore ins(%ub12 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub13 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub13 : !pto.tile_buf) - pto.tstore ins(%ub13 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub14 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, 
#pto.address_space>) - outs(%ub14 : !pto.tile_buf) - pto.tstore ins(%ub14 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub15 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub15 : !pto.tile_buf) - pto.tstore ins(%ub15 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub16 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub16 : !pto.tile_buf) - pto.tstore ins(%ub16 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub17 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub17 : !pto.tile_buf) - pto.tstore ins(%ub17 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub18 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub18 : !pto.tile_buf) - pto.tstore ins(%ub18 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub19 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub19 : !pto.tile_buf) - pto.tstore ins(%ub19 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub20 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub20 : !pto.tile_buf) - pto.tstore ins(%ub20 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub21 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub21 : !pto.tile_buf) - pto.tstore ins(%ub21 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub22 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub22 : !pto.tile_buf) - pto.tstore ins(%ub22 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, 
#pto.address_space>) - - %ub23 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub23 : !pto.tile_buf) - pto.tstore ins(%ub23 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub24 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub24 : !pto.tile_buf) - pto.tstore ins(%ub24 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub25 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub25 : !pto.tile_buf) - pto.tstore ins(%ub25 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub26 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub26 : !pto.tile_buf) - pto.tstore ins(%ub26 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub27 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub27 : !pto.tile_buf) - pto.tstore ins(%ub27 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub28 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub28 : !pto.tile_buf) - pto.tstore ins(%ub28 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - %ub29 = pto.alloc_tile : !pto.tile_buf - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub29 : !pto.tile_buf) - pto.tstore ins(%ub29 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - return - } -} - -// Anchor checks after the PlanMemory debug marker (ptoas prints the module -// before and after planning). -// Expect at least two distinct allocations to reuse offset 0. 
-""" - -if __name__ == "__main__": - print(PTO_IR) From ea851ed52c107539f87486dd31afbb8fbe4440c0 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 21:11:22 +0800 Subject: [PATCH 07/14] Derive SOC_VERSION from PTO_ARCH in remote validation --- .../scripts/run_remote_npu_validation.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 92d3122c..0e5283d0 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -123,10 +123,10 @@ export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')" case "${pto_arch_lc}" in - a5) SOC_VERSION="a5" ;; - a3) SOC_VERSION="a3" ;; + a5) SOC_VERSION="Ascend910_95" ;; + a3) SOC_VERSION="Ascend910B" ;; *) - SOC_VERSION="a3" + SOC_VERSION="Ascend910B" pto_arch_lc="a3" ;; esac @@ -212,16 +212,12 @@ while IFS= read -r -d '' cpp; do nv_dir="${OUTPUT_ROOT}/${sample_name}/${testcase}" set +e - pto_arch_args=() - if [[ -n "${PTO_ARCH}" ]]; then - pto_arch_args+=(--pto-arch "${PTO_ARCH}") - fi python3 "${ROOT_DIR}/test/npu_validation/scripts/generate_testcase.py" \ --input "${cpp}" \ --testcase "${testcase}" \ --output-root "${OUTPUT_ROOT}" \ --run-mode "${RUN_MODE}" \ - "${pto_arch_args[@]}" + --pto-arch "${PTO_ARCH}" gen_rc=$? 
set -euo pipefail if [[ $gen_rc -ne 0 ]]; then From 57e6d9c4fd190174c771dd48153cdec4132c3d26 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Tue, 10 Mar 2026 21:13:11 +0800 Subject: [PATCH 08/14] Restore SIM_SOC_VERSION Ascend910 mapping --- test/npu_validation/scripts/run_remote_npu_validation.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 0e5283d0..8a6574e3 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -132,6 +132,15 @@ case "${pto_arch_lc}" in esac SIM_SOC_VERSION="${SOC_VERSION}" +# Some CANN installs do not provide a simulator directory named exactly +# "Ascend910". Map it to a real directory so we can link/run camodel. +if [[ "${SOC_VERSION}" == "Ascend910" ]]; then + if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then + SIM_SOC_VERSION="Ascend910A" + elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then + SIM_SOC_VERSION="Ascend910ProA" + fi +fi log "SIM_SOC_VERSION=${SIM_SOC_VERSION}" LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH}" From 94dbadf889bc56665097489f157acea33da1be15 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Thu, 12 Mar 2026 20:05:41 +0800 Subject: [PATCH 09/14] Revert planmemory changes --- lib/PTO/Transforms/PTOPlanMemory.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp index 24d24171..257410f2 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.cpp +++ b/lib/PTO/Transforms/PTOPlanMemory.cpp @@ -123,6 +123,10 @@ void MemLivenessAnalysis::RecursionIR(Region *region, Liveness live) { // return WalkResult::advance(); } else if (auto loadOp = dyn_cast(op)) { OpKillHandle(curOpInfo, live, op->getBlock()); + } else if (auto getValDpsOp = dyn_cast(op)) { + // GetValDpsOp only reads from 
buffer, similar to LoadOp. + (void)getValDpsOp; + OpKillHandle(curOpInfo, live, op->getBlock()); } else if (auto tprintOp = dyn_cast(op)) { // TPrintOp only reads from buffer, similar to LoadOp OpKillHandle(curOpInfo, live, op->getBlock()); @@ -752,8 +756,7 @@ void MemPlan::PrintSuccessfulAllocatedMaxBits() { for (auto& child : it->second->mergedChildren) { ubAllocBits = std::max(ubAllocBits, child->bitsOffset + child->alignedConstBits); } - llvm::outs() << "[PTOPlanMemory] Allocated UB size = " << ubAllocBits - << " bits\n"; + llvm::outs() << "[AscendNPU IR] Allocated UB size = " << ubAllocBits << " bits "<< "\n"; } } From 07ab754f3468f426ca81e6b597d9135ccd6c595b Mon Sep 17 00:00:00 2001 From: PTOAS Date: Thu, 12 Mar 2026 20:27:53 +0800 Subject: [PATCH 10/14] Restore plan_memory tests --- .../plan_memory_bind_tile_alias_liveness.mlir | 35 +++ .../plan_memory_for_iter_args_yield.mlir | 34 +++ .../plan_memory_fragmentation_hole_fit.mlir | 151 +++++++++++++ .../plan_memory_fragmentation_two_holes.mlir | 162 ++++++++++++++ test/basic/plan_memory_if_in_loop.mlir | 37 ++++ test/basic/plan_memory_if_yield.mlir | 33 +++ test/basic/plan_memory_loop_in_if.mlir | 37 ++++ .../plan_memory_loop_no_reuse_outer_live.mlir | 40 ++++ test/basic/plan_memory_nested_loops.mlir | 45 ++++ test/basic/plan_memory_no_reuse_overlap.mlir | 30 +++ .../basic/plan_memory_peak_8_overlapping.mlir | 59 +++++ .../plan_memory_peak_exact_capacity.mlir | 141 ++++++++++++ test/basic/plan_memory_reuse_sequential.mlir | 202 ++++++++++++++++++ .../basic/plan_memory_scopes_independent.mlir | 29 +++ .../plan_memory_bind_tile_alias_liveness.py | 27 +++ .../plan_memory_for_iter_args_yield.py | 32 +++ .../plan_memory_fragmentation_hole_fit.py | 149 +++++++++++++ .../plan_memory_fragmentation_two_holes.py | 157 ++++++++++++++ .../planmemory/plan_memory_if_in_loop.py | 32 +++ .../planmemory/plan_memory_if_yield.py | 29 +++ .../planmemory/plan_memory_loop_in_if.py | 32 +++ 
.../plan_memory_loop_no_reuse_outer_live.py | 36 ++++ .../planmemory/plan_memory_nested_loops.py | 40 ++++ .../plan_memory_no_reuse_overlap.py | 27 +++ .../plan_memory_peak_8_overlapping.py | 57 +++++ .../plan_memory_peak_exact_capacity.py | 139 ++++++++++++ .../plan_memory_reuse_sequential.py | 200 +++++++++++++++++ .../plan_memory_scopes_independent.py | 26 +++ 28 files changed, 2018 insertions(+) create mode 100644 test/basic/plan_memory_bind_tile_alias_liveness.mlir create mode 100644 test/basic/plan_memory_for_iter_args_yield.mlir create mode 100644 test/basic/plan_memory_fragmentation_hole_fit.mlir create mode 100644 test/basic/plan_memory_fragmentation_two_holes.mlir create mode 100644 test/basic/plan_memory_if_in_loop.mlir create mode 100644 test/basic/plan_memory_if_yield.mlir create mode 100644 test/basic/plan_memory_loop_in_if.mlir create mode 100644 test/basic/plan_memory_loop_no_reuse_outer_live.mlir create mode 100644 test/basic/plan_memory_nested_loops.mlir create mode 100644 test/basic/plan_memory_no_reuse_overlap.mlir create mode 100644 test/basic/plan_memory_peak_8_overlapping.mlir create mode 100644 test/basic/plan_memory_peak_exact_capacity.mlir create mode 100644 test/basic/plan_memory_reuse_sequential.mlir create mode 100644 test/basic/plan_memory_scopes_independent.mlir create mode 100644 test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py create mode 100644 test/samples/planmemory/plan_memory_for_iter_args_yield.py create mode 100644 test/samples/planmemory/plan_memory_fragmentation_hole_fit.py create mode 100644 test/samples/planmemory/plan_memory_fragmentation_two_holes.py create mode 100644 test/samples/planmemory/plan_memory_if_in_loop.py create mode 100644 test/samples/planmemory/plan_memory_if_yield.py create mode 100644 test/samples/planmemory/plan_memory_loop_in_if.py create mode 100644 test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py create mode 100644 test/samples/planmemory/plan_memory_nested_loops.py 
create mode 100644 test/samples/planmemory/plan_memory_no_reuse_overlap.py create mode 100644 test/samples/planmemory/plan_memory_peak_8_overlapping.py create mode 100644 test/samples/planmemory/plan_memory_peak_exact_capacity.py create mode 100644 test/samples/planmemory/plan_memory_reuse_sequential.py create mode 100644 test/samples/planmemory/plan_memory_scopes_independent.py diff --git a/test/basic/plan_memory_bind_tile_alias_liveness.mlir b/test/basic/plan_memory_bind_tile_alias_liveness.mlir new file mode 100644 index 00000000..d58be111 --- /dev/null +++ b/test/basic/plan_memory_bind_tile_alias_liveness.mlir @@ -0,0 +1,35 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @bind_tile_alias_liveness(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %c16 = arith.constant 16 : index + + %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %av = pto.bind_tile %a, %c16, %c16 + {config = #pto.tile_buf_config} + : memref<16x16x16xf16, #pto.address_space> -> memref<16x16x16xf16, #pto.address_space> + + %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + // Using %av should keep %a live; %b must not reuse %a's offset. + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%av : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%av : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! 
+// CHECK: func.func @bind_tile_alias_liveness +// CHECK-NOT: memref.alloc +// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 +// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> + diff --git a/test/basic/plan_memory_for_iter_args_yield.mlir b/test/basic/plan_memory_for_iter_args_yield.mlir new file mode 100644 index 00000000..c6254e46 --- /dev/null +++ b/test/basic/plan_memory_for_iter_args_yield.mlir @@ -0,0 +1,34 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @for_iter_args_yield(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + scf.for %i = %c0 to %c2 step %c1 { + // Two allocs inside the loop body exercise liveness within loops and + // per-iteration memory planning for overlapping lifetimes. + %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%a : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @for_iter_args_yield +// CHECK-NOT: memref.alloc +// CHECK: scf.for +// After matching the loop header, require at least one planned buffer inside +// the loop body (the inner alloc becomes a pointer_cast). 
+// CHECK: pto.pointer_cast +// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_fragmentation_hole_fit.mlir b/test/basic/plan_memory_fragmentation_hole_fit.mlir new file mode 100644 index 00000000..7b9a5690 --- /dev/null +++ b/test/basic/plan_memory_fragmentation_hole_fit.mlir @@ -0,0 +1,151 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @fragmentation_hole_fit(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + // Force a tight situation: + // - Keep 23 UB buffers live across the function (23 * 8192 = 188416 B). + // - Only 1 UB slot remains. Two short-lived buffers must reuse that slot. + %k0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k19 = memref.alloc() : memref<16x16x16xf16, 
#pto.address_space> + %k20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + // Touch all k-buffers early so their lifetimes start before the temps. + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k1 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k2 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k3 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k4 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k5 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k6 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k7 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k8 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k9 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k10 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k11 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k12 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k13 : memref<16x16x16xf16, #pto.address_space>) + 
pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k14 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k15 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k16 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k17 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k18 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k19 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k20 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k21 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k22 : memref<16x16x16xf16, #pto.address_space>) + + %t0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%t0 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%t0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %t1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%t1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%t1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + // Keep all k-buffers live until the end. 
+ pto.tstore ins(%k0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k2 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k3 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k4 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k5 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k6 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k7 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k8 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k9 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k10 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k11 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k12 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k13 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k14 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k15 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k16 : 
memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k17 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k18 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k19 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k20 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k21 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k22 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @fragmentation_hole_fit +// CHECK-NOT: memref.alloc +// With 23 live UB buffers, there is exactly one remaining 8192B slot. The two +// short-lived buffers must reuse that slot (offset 23*8192 = 188416). +// CHECK-DAG: %[[O188416:.*]] = arith.constant 188416 : i64 +// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_fragmentation_two_holes.mlir b/test/basic/plan_memory_fragmentation_two_holes.mlir new file mode 100644 index 00000000..f1a817d9 --- /dev/null +++ b/test/basic/plan_memory_fragmentation_two_holes.mlir @@ -0,0 +1,162 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @fragmentation_two_holes(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + // Tight + overlap: + // - Keep 22 UB buffers live (22 * 8192 = 180224 B), leaving 2 free slots. + // - Allocate 2 short-lived buffers with overlapping lifetimes twice. 
+ // The two free slots are at offsets 22*8192 and 23*8192. + %k0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %k21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + // Touch all k-buffers early so their lifetimes start before the temps. 
+ pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k1 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k2 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k3 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k4 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k5 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k6 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k7 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k8 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k9 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k10 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k11 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k12 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k13 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k14 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k15 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, 
#pto.address_space>) + outs(%k16 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k17 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k18 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k19 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k20 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%k21 : memref<16x16x16xf16, #pto.address_space>) + + // Stage 1: two overlapping temps. + %a0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %b0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%a0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b0 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%a0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + // Stage 2: two overlapping temps again, which should reuse the same two slots. 
+ %a1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %b1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%a1 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%a1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + // Keep all k-buffers live until the end. + pto.tstore ins(%k0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k2 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k3 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k4 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k5 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k6 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k7 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k8 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k9 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k10 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, 
#pto.address_space>) + pto.tstore ins(%k11 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k12 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k13 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k14 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k15 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k16 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k17 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k18 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k19 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k20 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%k21 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @fragmentation_two_holes +// CHECK-NOT: memref.alloc +// With 22 live UB buffers, there are exactly two remaining 8192B slots at +// offsets 22*8192 and 23*8192, reused across the two stages. 
+// CHECK-DAG: %[[O180224:.*]] = arith.constant 180224 : i64 +// CHECK-DAG: %[[O188416:.*]] = arith.constant 188416 : i64 +// CHECK-DAG: pto.pointer_cast(%[[O180224]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%[[O180224]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_if_in_loop.mlir b/test/basic/plan_memory_if_in_loop.mlir new file mode 100644 index 00000000..accc16a1 --- /dev/null +++ b/test/basic/plan_memory_if_in_loop.mlir @@ -0,0 +1,37 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @if_in_loop(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + scf.for %i = %c0 to %c2 step %c1 { + %is0 = arith.cmpi eq, %i, %c0 : index + scf.if %is0 { + %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%a : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } else { + %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + } + return + } +} + +// CHECK: end PTO plan Mem! 
+// CHECK: func.func @if_in_loop +// CHECK-NOT: memref.alloc +// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 +// CHECK: scf.for +// CHECK: scf.if +// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_if_yield.mlir b/test/basic/plan_memory_if_yield.mlir new file mode 100644 index 00000000..bd9b48b2 --- /dev/null +++ b/test/basic/plan_memory_if_yield.mlir @@ -0,0 +1,33 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @if_yield(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %cond = arith.constant true + // Use scf.if control-flow without yielding a memref (the current emitc + // lowering can't handle memref-typed scf.if results), but still ensure + // PlanMemory rewrites allocs inside both branches. + scf.if %cond { + %then = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%then : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%then : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } else { + %els = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%els : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%els : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + return + } +} + +// CHECK: end PTO plan Mem! 
+// CHECK: func.func @if_yield +// CHECK-NOT: memref.alloc +// CHECK: scf.if +// CHECK: pto.pointer_cast +// CHECK: } else { +// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_loop_in_if.mlir b/test/basic/plan_memory_loop_in_if.mlir new file mode 100644 index 00000000..31d2a970 --- /dev/null +++ b/test/basic/plan_memory_loop_in_if.mlir @@ -0,0 +1,37 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @loop_in_if(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %true = arith.constant true + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + scf.if %true { + scf.for %i = %c0 to %c2 step %c1 { + %a = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%a : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%a : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + } else { + %b = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%b : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%b : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + return + } +} + +// CHECK: end PTO plan Mem! 
+// CHECK: func.func @loop_in_if +// CHECK-NOT: memref.alloc +// CHECK: scf.if +// CHECK: scf.for +// CHECK: } else { +// CHECK: pto.pointer_cast + diff --git a/test/basic/plan_memory_loop_no_reuse_outer_live.mlir b/test/basic/plan_memory_loop_no_reuse_outer_live.mlir new file mode 100644 index 00000000..f08bc9f5 --- /dev/null +++ b/test/basic/plan_memory_loop_no_reuse_outer_live.mlir @@ -0,0 +1,40 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @loop_outer_live(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + + // A buffer that remains live across the loop (used after the loop). + %outer = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%outer : memref<16x16x16xf16, #pto.address_space>) + + // A loop-local buffer used inside the loop. + %inner = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + scf.for %i = %c0 to %c4 step %c1 { + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%inner : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%inner : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + + // Use %outer after the loop to keep it live across the loop. + pto.tstore ins(%outer : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @loop_outer_live +// CHECK-NOT: memref.alloc +// Expect a loop, and two planned buffers at distinct offsets. 
+// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 +// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK: scf.for diff --git a/test/basic/plan_memory_nested_loops.mlir b/test/basic/plan_memory_nested_loops.mlir new file mode 100644 index 00000000..0301039a --- /dev/null +++ b/test/basic/plan_memory_nested_loops.mlir @@ -0,0 +1,45 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @nested_loops(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + // Outer buffer stays live across both loops (used after). + %outer = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%outer : memref<16x16x16xf16, #pto.address_space>) + + scf.for %i = %c0 to %c2 step %c1 { + // Buffer allocated inside the outer loop. + %mid = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%mid : memref<16x16x16xf16, #pto.address_space>) + scf.for %j = %c0 to %c2 step %c1 { + // A temp buffer in the inner loop. 
+ %tmp = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%tmp : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%tmp : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + pto.tstore ins(%mid : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + } + + pto.tstore ins(%outer : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @nested_loops +// CHECK-NOT: memref.alloc +// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 +// CHECK: scf.for +// CHECK: scf.for +// CHECK: pto.pointer_cast diff --git a/test/basic/plan_memory_no_reuse_overlap.mlir b/test/basic/plan_memory_no_reuse_overlap.mlir new file mode 100644 index 00000000..b7a6e0d9 --- /dev/null +++ b/test/basic/plan_memory_no_reuse_overlap.mlir @@ -0,0 +1,30 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @no_reuse_overlap(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %ub0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %ub1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + // Make lifetimes overlap by using both buffers after both are created. 
+ pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @no_reuse_overlap +// CHECK-NOT: memref.alloc +// With overlapping lifetimes, offsets must differ. +// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: %c8192_i64 = arith.constant 8192 : i64 +// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%c8192_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_peak_8_overlapping.mlir b/test/basic/plan_memory_peak_8_overlapping.mlir new file mode 100644 index 00000000..b22b54fb --- /dev/null +++ b/test/basic/plan_memory_peak_8_overlapping.mlir @@ -0,0 +1,59 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @peak_8_overlapping(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + // Peak liveness: 8 buffers live at once. 
+ %u0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u1 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u2 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u3 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u4 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u5 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u6 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u7 : memref<16x16x16xf16, #pto.address_space>) + + pto.tstore ins(%u0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u2 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u3 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + 
pto.tstore ins(%u4 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u5 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u6 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u7 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @peak_8_overlapping +// CHECK-NOT: memref.alloc +// 8 live buffers implies a max offset of 7*8192 = 57344 bytes. +// CHECK: %[[O57344:.*]] = arith.constant 57344 : i64 +// CHECK: pto.pointer_cast(%[[O57344]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> + diff --git a/test/basic/plan_memory_peak_exact_capacity.mlir b/test/basic/plan_memory_peak_exact_capacity.mlir new file mode 100644 index 00000000..eaef6d40 --- /dev/null +++ b/test/basic/plan_memory_peak_exact_capacity.mlir @@ -0,0 +1,141 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @peak_exact_capacity(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + // Default UB size is 1572864 bits (196608 bytes). Each buffer here is + // 16*16*16*f16 = 8192 bytes. 24 buffers live at once should fit exactly: + // 24 * 8192 = 196608 bytes. 
+ %u0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u10 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %u23 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u0 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u1 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u2 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : 
memref<16x16x16xf16, #pto.address_space>) + outs(%u3 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u4 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u5 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u6 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u7 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u8 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u9 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u10 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u11 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u12 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u13 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u14 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u15 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u16 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u17 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u18 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u19 
: memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u20 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u21 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u22 : memref<16x16x16xf16, #pto.address_space>) + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%u23 : memref<16x16x16xf16, #pto.address_space>) + + pto.tstore ins(%u0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u2 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u3 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u4 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u5 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u6 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u7 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u8 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u9 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u10 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u11 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) 
+ pto.tstore ins(%u12 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u13 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u14 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u15 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u16 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u17 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u18 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u19 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u20 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u21 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u22 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%u23 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! +// CHECK: func.func @peak_exact_capacity +// CHECK-NOT: memref.alloc +// 24 live buffers implies a max offset of 23*8192 = 188416 bytes. 
+// CHECK: %[[O188416:.*]] = arith.constant 188416 : i64 +// CHECK: pto.pointer_cast(%[[O188416]]) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> + diff --git a/test/basic/plan_memory_reuse_sequential.mlir b/test/basic/plan_memory_reuse_sequential.mlir new file mode 100644 index 00000000..20ff62ab --- /dev/null +++ b/test/basic/plan_memory_reuse_sequential.mlir @@ -0,0 +1,202 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @reuse_sequential(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + // Force reuse: + // UB capacity (default) is 1572864 bits (196608 bytes). Each buffer here is + // 16*16*16*f16 = 8192 bytes. Allocating 30 such buffers exceeds UB capacity + // unless memory reuse is applied. + %ub0 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub0 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub0 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub2 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub2 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub2 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub3 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub3 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub3 : memref<16x16x16xf16, #pto.address_space>) + 
outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub4 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub4 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub4 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub5 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub5 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub5 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub6 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub6 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub6 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub7 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub7 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub7 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub8 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub8 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub8 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub9 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub9 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub9 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub10 = memref.alloc() : memref<16x16x16xf16, 
#pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub10 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub10 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub11 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub11 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub11 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub12 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub12 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub12 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub13 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub13 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub13 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub14 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub14 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub14 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub15 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub15 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub15 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub16 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + 
outs(%ub16 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub16 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub17 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub17 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub17 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub18 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub18 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub18 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub19 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub19 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub19 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub20 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub20 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub20 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub21 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub21 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub21 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub22 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub22 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub22 : 
memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub23 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub23 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub23 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub24 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub24 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub24 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub25 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub25 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub25 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub26 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub26 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub26 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub27 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub27 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub27 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + %ub28 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub28 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub28 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, 
#pto.address_space>) + + %ub29 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub29 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub29 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + return + } +} + +// Anchor checks after the PlanMemory debug marker (ptoas prints the module +// before and after planning). +// CHECK: end PTO plan Mem! +// CHECK: func.func @reuse_sequential +// CHECK-NOT: memref.alloc +// Expect at least two distinct allocations to reuse offset 0. +// CHECK: %c0_i64 = arith.constant 0 : i64 +// CHECK: %[[BUF0:.*]] = pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK: %[[BUF1:.*]] = pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> diff --git a/test/basic/plan_memory_scopes_independent.mlir b/test/basic/plan_memory_scopes_independent.mlir new file mode 100644 index 00000000..b95b8c7e --- /dev/null +++ b/test/basic/plan_memory_scopes_independent.mlir @@ -0,0 +1,29 @@ +// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s + +module { + func.func @scopes_independent(%arg0: memref<16x16x16xf16, #pto.address_space>, + %arg1: memref<16x16x16xf16, #pto.address_space>) { + %ub = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + %l1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> + + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%ub : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%ub : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + + pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) + outs(%l1 : memref<16x16x16xf16, #pto.address_space>) + pto.tstore ins(%l1 : memref<16x16x16xf16, #pto.address_space>) + outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) + return + } +} + +// CHECK: end PTO plan Mem! 
+// CHECK: func.func @scopes_independent +// CHECK-NOT: memref.alloc +// Offsets are planned per-scope, so both UB and L1 can start at 0. +// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 +// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> +// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space> + diff --git a/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py b/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py new file mode 100644 index 00000000..fd92e85a --- /dev/null +++ b/test/samples/planmemory/plan_memory_bind_tile_alias_liveness.py @@ -0,0 +1,27 @@ +PTO_IR = r""" + +module { + func.func @bind_tile_alias_liveness(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %c16 = arith.constant 16 : index + + %a = pto.alloc_tile : !pto.tile_buf + + %b = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b : !pto.tile_buf) + pto.tstore ins(%b : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + // Using %a should keep %a live; %b must not reuse %a's offset. 
+ pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a : !pto.tile_buf) + pto.tstore ins(%a : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_for_iter_args_yield.py b/test/samples/planmemory/plan_memory_for_iter_args_yield.py new file mode 100644 index 00000000..f4d9d7f7 --- /dev/null +++ b/test/samples/planmemory/plan_memory_for_iter_args_yield.py @@ -0,0 +1,32 @@ +PTO_IR = r""" + +module { + func.func @for_iter_args_yield(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + scf.for %i = %c0 to %c2 step %c1 { + // Two allocs inside the loop body exercise liveness within loops and + // per-iteration memory planning for overlapping lifetimes. + %a = pto.alloc_tile : !pto.tile_buf + %b = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b : !pto.tile_buf) + pto.tstore ins(%a : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%b : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + return + } +} + +// After matching the loop header, require at least one planned buffer inside +// the loop body (the inner alloc becomes a pointer_cast). 
+""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py b/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py new file mode 100644 index 00000000..1447ed2f --- /dev/null +++ b/test/samples/planmemory/plan_memory_fragmentation_hole_fit.py @@ -0,0 +1,149 @@ +PTO_IR = r""" + +module { + func.func @fragmentation_hole_fit(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + // Force a tight situation: + // - Keep 23 UB buffers live across the function (23 * 8192 = 188416 B). + // - Only 1 UB slot remains. Two short-lived buffers must reuse that slot. + %k0 = pto.alloc_tile : !pto.tile_buf + %k1 = pto.alloc_tile : !pto.tile_buf + %k2 = pto.alloc_tile : !pto.tile_buf + %k3 = pto.alloc_tile : !pto.tile_buf + %k4 = pto.alloc_tile : !pto.tile_buf + %k5 = pto.alloc_tile : !pto.tile_buf + %k6 = pto.alloc_tile : !pto.tile_buf + %k7 = pto.alloc_tile : !pto.tile_buf + %k8 = pto.alloc_tile : !pto.tile_buf + %k9 = pto.alloc_tile : !pto.tile_buf + %k10 = pto.alloc_tile : !pto.tile_buf + %k11 = pto.alloc_tile : !pto.tile_buf + %k12 = pto.alloc_tile : !pto.tile_buf + %k13 = pto.alloc_tile : !pto.tile_buf + %k14 = pto.alloc_tile : !pto.tile_buf + %k15 = pto.alloc_tile : !pto.tile_buf + %k16 = pto.alloc_tile : !pto.tile_buf + %k17 = pto.alloc_tile : !pto.tile_buf + %k18 = pto.alloc_tile : !pto.tile_buf + %k19 = pto.alloc_tile : !pto.tile_buf + %k20 = pto.alloc_tile : !pto.tile_buf + %k21 = pto.alloc_tile : !pto.tile_buf + %k22 = pto.alloc_tile : !pto.tile_buf + + // Touch all k-buffers early so their lifetimes start before the temps. 
+ pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k1 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k2 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k3 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k4 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k5 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k6 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k7 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k8 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k9 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k10 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k11 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k12 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k13 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k14 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k15 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k16 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k17 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k18 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k19 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k20 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, 
#pto.address_space>) + outs(%k21 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k22 : !pto.tile_buf) + + %t0 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%t0 : !pto.tile_buf) + pto.tstore ins(%t0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %t1 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%t1 : !pto.tile_buf) + pto.tstore ins(%t1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + // Keep all k-buffers live until the end. + pto.tstore ins(%k0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k2 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k3 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k4 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k5 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k6 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k7 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k8 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k9 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k10 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k11 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k12 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k13 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k14 : !pto.tile_buf) + outs(%arg1 
: memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k15 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k16 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k17 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k18 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k19 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k20 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k21 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k22 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} + +// With 23 live UB buffers, there is exactly one remaining 8192B slot. The two +// short-lived buffers must reuse that slot (offset 23*8192 = 188416). +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_fragmentation_two_holes.py b/test/samples/planmemory/plan_memory_fragmentation_two_holes.py new file mode 100644 index 00000000..cf360b0a --- /dev/null +++ b/test/samples/planmemory/plan_memory_fragmentation_two_holes.py @@ -0,0 +1,157 @@ +PTO_IR = r""" + +module { + func.func @fragmentation_two_holes(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + // Tight + overlap: + // - Keep 22 UB buffers live (22 * 8192 = 180224 B), leaving 2 free slots. + // - Allocate 2 short-lived buffers with overlapping lifetimes twice. + // The two free slots are at offsets 22*8192 and 23*8192. 
+ %k0 = pto.alloc_tile : !pto.tile_buf + %k1 = pto.alloc_tile : !pto.tile_buf + %k2 = pto.alloc_tile : !pto.tile_buf + %k3 = pto.alloc_tile : !pto.tile_buf + %k4 = pto.alloc_tile : !pto.tile_buf + %k5 = pto.alloc_tile : !pto.tile_buf + %k6 = pto.alloc_tile : !pto.tile_buf + %k7 = pto.alloc_tile : !pto.tile_buf + %k8 = pto.alloc_tile : !pto.tile_buf + %k9 = pto.alloc_tile : !pto.tile_buf + %k10 = pto.alloc_tile : !pto.tile_buf + %k11 = pto.alloc_tile : !pto.tile_buf + %k12 = pto.alloc_tile : !pto.tile_buf + %k13 = pto.alloc_tile : !pto.tile_buf + %k14 = pto.alloc_tile : !pto.tile_buf + %k15 = pto.alloc_tile : !pto.tile_buf + %k16 = pto.alloc_tile : !pto.tile_buf + %k17 = pto.alloc_tile : !pto.tile_buf + %k18 = pto.alloc_tile : !pto.tile_buf + %k19 = pto.alloc_tile : !pto.tile_buf + %k20 = pto.alloc_tile : !pto.tile_buf + %k21 = pto.alloc_tile : !pto.tile_buf + + // Touch all k-buffers early so their lifetimes start before the temps. + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k1 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k2 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k3 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k4 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k5 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k6 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k7 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k8 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k9 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k10 : !pto.tile_buf) + pto.tload ins(%arg0 : 
memref<16x256xf16, #pto.address_space>) + outs(%k11 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k12 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k13 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k14 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k15 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k16 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k17 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k18 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k19 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k20 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%k21 : !pto.tile_buf) + + // Stage 1: two overlapping temps. + %a0 = pto.alloc_tile : !pto.tile_buf + %b0 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b0 : !pto.tile_buf) + pto.tstore ins(%a0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%b0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + // Stage 2: two overlapping temps again, which should reuse the same two slots. 
+ %a1 = pto.alloc_tile : !pto.tile_buf + %b1 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a1 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b1 : !pto.tile_buf) + pto.tstore ins(%a1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%b1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + // Keep all k-buffers live until the end. + pto.tstore ins(%k0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k2 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k3 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k4 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k5 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k6 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k7 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k8 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k9 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k10 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k11 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k12 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k13 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k14 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k15 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + 
pto.tstore ins(%k16 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k17 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k18 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k19 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k20 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%k21 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} + +// With 22 live UB buffers, there are exactly two remaining 8192B slots at +// offsets 22*8192 and 23*8192, reused across the two stages. +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_if_in_loop.py b/test/samples/planmemory/plan_memory_if_in_loop.py new file mode 100644 index 00000000..dccdbe2d --- /dev/null +++ b/test/samples/planmemory/plan_memory_if_in_loop.py @@ -0,0 +1,32 @@ +PTO_IR = r""" + +module { + func.func @if_in_loop(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + scf.for %i = %c0 to %c2 step %c1 { + %is0 = arith.cmpi eq, %i, %c0 : index + scf.if %is0 { + %a = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a : !pto.tile_buf) + pto.tstore ins(%a : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } else { + %b = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b : !pto.tile_buf) + pto.tstore ins(%b : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + } + return + } +} +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_if_yield.py b/test/samples/planmemory/plan_memory_if_yield.py 
new file mode 100644 index 00000000..3a1bbb8d --- /dev/null +++ b/test/samples/planmemory/plan_memory_if_yield.py @@ -0,0 +1,29 @@ +PTO_IR = r""" + +module { + func.func @if_yield(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %cond = arith.constant true + // Use scf.if control-flow without yielding a memref (the current emitc + // lowering can't handle memref-typed scf.if results), but still ensure + // PlanMemory rewrites allocs inside both branches. + scf.if %cond { + %then = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%then : !pto.tile_buf) + pto.tstore ins(%then : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } else { + %els = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%els : !pto.tile_buf) + pto.tstore ins(%els : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + return + } +} +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_loop_in_if.py b/test/samples/planmemory/plan_memory_loop_in_if.py new file mode 100644 index 00000000..1b4a4602 --- /dev/null +++ b/test/samples/planmemory/plan_memory_loop_in_if.py @@ -0,0 +1,32 @@ +PTO_IR = r""" + +module { + func.func @loop_in_if(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %true = arith.constant true + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + scf.if %true { + scf.for %i = %c0 to %c2 step %c1 { + %a = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%a : !pto.tile_buf) + pto.tstore ins(%a : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + } else { + %b = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%b : 
!pto.tile_buf) + pto.tstore ins(%b : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + return + } +} +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py b/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py new file mode 100644 index 00000000..76207643 --- /dev/null +++ b/test/samples/planmemory/plan_memory_loop_no_reuse_outer_live.py @@ -0,0 +1,36 @@ +PTO_IR = r""" + +module { + func.func @loop_outer_live(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + + // A buffer that remains live across the loop (used after the loop). + %outer = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%outer : !pto.tile_buf) + + // A loop-local buffer used inside the loop. + %inner = pto.alloc_tile : !pto.tile_buf + scf.for %i = %c0 to %c4 step %c1 { + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%inner : !pto.tile_buf) + pto.tstore ins(%inner : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + + // Use %outer after the loop to keep it live across the loop. + pto.tstore ins(%outer : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + return + } +} + +// Expect a loop, and two planned buffers at distinct offsets. 
+""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_nested_loops.py b/test/samples/planmemory/plan_memory_nested_loops.py new file mode 100644 index 00000000..574c7019 --- /dev/null +++ b/test/samples/planmemory/plan_memory_nested_loops.py @@ -0,0 +1,40 @@ +PTO_IR = r""" + +module { + func.func @nested_loops(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + + // Outer buffer stays live across both loops (used after). + %outer = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%outer : !pto.tile_buf) + + scf.for %i = %c0 to %c2 step %c1 { + // Buffer allocated inside the outer loop. + %mid = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%mid : !pto.tile_buf) + scf.for %j = %c0 to %c2 step %c1 { + // A temp buffer in the inner loop. 
+ %tmp = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%tmp : !pto.tile_buf) + pto.tstore ins(%tmp : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + pto.tstore ins(%mid : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + } + + pto.tstore ins(%outer : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_no_reuse_overlap.py b/test/samples/planmemory/plan_memory_no_reuse_overlap.py new file mode 100644 index 00000000..2bcabe67 --- /dev/null +++ b/test/samples/planmemory/plan_memory_no_reuse_overlap.py @@ -0,0 +1,27 @@ +PTO_IR = r""" + +module { + func.func @no_reuse_overlap(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %ub0 = pto.alloc_tile : !pto.tile_buf + %ub1 = pto.alloc_tile : !pto.tile_buf + + // Make lifetimes overlap by using both buffers after both are created. + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub1 : !pto.tile_buf) + pto.tstore ins(%ub0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%ub1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + return + } +} + +// With overlapping lifetimes, offsets must differ. 
+""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_peak_8_overlapping.py b/test/samples/planmemory/plan_memory_peak_8_overlapping.py new file mode 100644 index 00000000..863aa4c0 --- /dev/null +++ b/test/samples/planmemory/plan_memory_peak_8_overlapping.py @@ -0,0 +1,57 @@ +PTO_IR = r""" + +module { + func.func @peak_8_overlapping(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + // Peak liveness: 8 buffers live at once. + %u0 = pto.alloc_tile : !pto.tile_buf + %u1 = pto.alloc_tile : !pto.tile_buf + %u2 = pto.alloc_tile : !pto.tile_buf + %u3 = pto.alloc_tile : !pto.tile_buf + %u4 = pto.alloc_tile : !pto.tile_buf + %u5 = pto.alloc_tile : !pto.tile_buf + %u6 = pto.alloc_tile : !pto.tile_buf + %u7 = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u1 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u2 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u3 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u4 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u5 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u6 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u7 : !pto.tile_buf) + + pto.tstore ins(%u0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u2 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u3 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u4 : !pto.tile_buf) + outs(%arg1 
: memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u5 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u6 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u7 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} + +// 8 live buffers implies a max offset of 7*8192 = 57344 bytes. +""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_peak_exact_capacity.py b/test/samples/planmemory/plan_memory_peak_exact_capacity.py new file mode 100644 index 00000000..d8e260f7 --- /dev/null +++ b/test/samples/planmemory/plan_memory_peak_exact_capacity.py @@ -0,0 +1,139 @@ +PTO_IR = r""" + +module { + func.func @peak_exact_capacity(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + // Default UB size is 1572864 bits (196608 bytes). Each buffer here is + // 16*16*16*f16 = 8192 bytes. 24 buffers live at once should fit exactly: + // 24 * 8192 = 196608 bytes. 
+ %u0 = pto.alloc_tile : !pto.tile_buf + %u1 = pto.alloc_tile : !pto.tile_buf + %u2 = pto.alloc_tile : !pto.tile_buf + %u3 = pto.alloc_tile : !pto.tile_buf + %u4 = pto.alloc_tile : !pto.tile_buf + %u5 = pto.alloc_tile : !pto.tile_buf + %u6 = pto.alloc_tile : !pto.tile_buf + %u7 = pto.alloc_tile : !pto.tile_buf + %u8 = pto.alloc_tile : !pto.tile_buf + %u9 = pto.alloc_tile : !pto.tile_buf + %u10 = pto.alloc_tile : !pto.tile_buf + %u11 = pto.alloc_tile : !pto.tile_buf + %u12 = pto.alloc_tile : !pto.tile_buf + %u13 = pto.alloc_tile : !pto.tile_buf + %u14 = pto.alloc_tile : !pto.tile_buf + %u15 = pto.alloc_tile : !pto.tile_buf + %u16 = pto.alloc_tile : !pto.tile_buf + %u17 = pto.alloc_tile : !pto.tile_buf + %u18 = pto.alloc_tile : !pto.tile_buf + %u19 = pto.alloc_tile : !pto.tile_buf + %u20 = pto.alloc_tile : !pto.tile_buf + %u21 = pto.alloc_tile : !pto.tile_buf + %u22 = pto.alloc_tile : !pto.tile_buf + %u23 = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u0 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u1 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u2 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u3 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u4 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u5 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u6 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u7 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u8 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u9 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u10 : !pto.tile_buf) + pto.tload ins(%arg0 : 
memref<16x256xf16, #pto.address_space>) + outs(%u11 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u12 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u13 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u14 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u15 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u16 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u17 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u18 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u19 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u20 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u21 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u22 : !pto.tile_buf) + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%u23 : !pto.tile_buf) + + pto.tstore ins(%u0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u2 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u3 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u4 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u5 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u6 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u7 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u8 : !pto.tile_buf) + outs(%arg1 : 
memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u9 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u10 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u11 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u12 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u13 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u14 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u15 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u16 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u17 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u18 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u19 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u20 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u21 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u22 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + pto.tstore ins(%u23 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} + +// 24 live buffers implies a max offset of 23*8192 = 188416 bytes. 
+""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_reuse_sequential.py b/test/samples/planmemory/plan_memory_reuse_sequential.py new file mode 100644 index 00000000..1114f851 --- /dev/null +++ b/test/samples/planmemory/plan_memory_reuse_sequential.py @@ -0,0 +1,200 @@ +PTO_IR = r""" + +module { + func.func @reuse_sequential(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + // Force reuse: + // UB capacity (default) is 1572864 bits (196608 bytes). Each buffer here is + // 16*16*16*f16 = 8192 bytes. Allocating 30 such buffers exceeds UB capacity + // unless memory reuse is applied. + %ub0 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub0 : !pto.tile_buf) + pto.tstore ins(%ub0 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub1 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub1 : !pto.tile_buf) + pto.tstore ins(%ub1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub2 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub2 : !pto.tile_buf) + pto.tstore ins(%ub2 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub3 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub3 : !pto.tile_buf) + pto.tstore ins(%ub3 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub4 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub4 : !pto.tile_buf) + pto.tstore ins(%ub4 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub5 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub5 : !pto.tile_buf) + pto.tstore ins(%ub5 : 
!pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub6 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub6 : !pto.tile_buf) + pto.tstore ins(%ub6 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub7 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub7 : !pto.tile_buf) + pto.tstore ins(%ub7 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub8 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub8 : !pto.tile_buf) + pto.tstore ins(%ub8 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub9 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub9 : !pto.tile_buf) + pto.tstore ins(%ub9 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub10 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub10 : !pto.tile_buf) + pto.tstore ins(%ub10 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub11 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub11 : !pto.tile_buf) + pto.tstore ins(%ub11 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub12 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub12 : !pto.tile_buf) + pto.tstore ins(%ub12 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub13 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub13 : !pto.tile_buf) + pto.tstore ins(%ub13 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub14 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, 
#pto.address_space>) + outs(%ub14 : !pto.tile_buf) + pto.tstore ins(%ub14 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub15 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub15 : !pto.tile_buf) + pto.tstore ins(%ub15 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub16 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub16 : !pto.tile_buf) + pto.tstore ins(%ub16 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub17 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub17 : !pto.tile_buf) + pto.tstore ins(%ub17 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub18 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub18 : !pto.tile_buf) + pto.tstore ins(%ub18 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub19 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub19 : !pto.tile_buf) + pto.tstore ins(%ub19 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub20 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub20 : !pto.tile_buf) + pto.tstore ins(%ub20 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub21 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub21 : !pto.tile_buf) + pto.tstore ins(%ub21 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub22 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub22 : !pto.tile_buf) + pto.tstore ins(%ub22 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, 
#pto.address_space>) + + %ub23 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub23 : !pto.tile_buf) + pto.tstore ins(%ub23 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub24 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub24 : !pto.tile_buf) + pto.tstore ins(%ub24 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub25 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub25 : !pto.tile_buf) + pto.tstore ins(%ub25 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub26 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub26 : !pto.tile_buf) + pto.tstore ins(%ub26 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub27 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub27 : !pto.tile_buf) + pto.tstore ins(%ub27 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub28 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub28 : !pto.tile_buf) + pto.tstore ins(%ub28 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + %ub29 = pto.alloc_tile : !pto.tile_buf + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub29 : !pto.tile_buf) + pto.tstore ins(%ub29 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + return + } +} + +// Anchor checks after the PlanMemory debug marker (ptoas prints the module +// before and after planning). +// Expect at least two distinct allocations to reuse offset 0. 
+""" + +if __name__ == "__main__": + print(PTO_IR) diff --git a/test/samples/planmemory/plan_memory_scopes_independent.py b/test/samples/planmemory/plan_memory_scopes_independent.py new file mode 100644 index 00000000..4310d2d5 --- /dev/null +++ b/test/samples/planmemory/plan_memory_scopes_independent.py @@ -0,0 +1,26 @@ +PTO_IR = r""" + +module { + func.func @scopes_independent(%arg0: memref<16x256xf16, #pto.address_space>, + %arg1: memref<16x256xf16, #pto.address_space>) { + %ub = pto.alloc_tile : !pto.tile_buf + %l1 = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%ub : !pto.tile_buf) + pto.tstore ins(%ub : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + + pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) + outs(%l1 : !pto.tile_buf) + pto.tstore ins(%l1 : !pto.tile_buf) + outs(%arg1 : memref<16x256xf16, #pto.address_space>) + return + } +} + +// Offsets are planned per-scope, so both UB and L1 can start at 0. 
+""" + +if __name__ == "__main__": + print(PTO_IR) From 250d0a755d395bdfef5fa583be34a62030aa5751 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Thu, 12 Mar 2026 20:38:04 +0800 Subject: [PATCH 11/14] Switch to pto-arch mapping in npu validation --- lib/PTO/Transforms/PTOPlanMemory.cpp | 170 +++++++++--------- lib/PTO/Transforms/PTOPlanMemory.h | 3 + .../scripts/generate_testcase.py | 51 ++++-- .../scripts/run_remote_npu_validation.sh | 6 +- .../templates/run_sh_template.sh | 12 +- 5 files changed, 139 insertions(+), 103 deletions(-) diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp index 257410f2..69569043 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.cpp +++ b/lib/PTO/Transforms/PTOPlanMemory.cpp @@ -123,10 +123,6 @@ void MemLivenessAnalysis::RecursionIR(Region *region, Liveness live) { // return WalkResult::advance(); } else if (auto loadOp = dyn_cast(op)) { OpKillHandle(curOpInfo, live, op->getBlock()); - } else if (auto getValDpsOp = dyn_cast(op)) { - // GetValDpsOp only reads from buffer, similar to LoadOp. 
- (void)getValDpsOp; - OpKillHandle(curOpInfo, live, op->getBlock()); } else if (auto tprintOp = dyn_cast(op)) { // TPrintOp only reads from buffer, similar to LoadOp OpKillHandle(curOpInfo, live, op->getBlock()); @@ -700,6 +696,37 @@ void MemPlan::EmitPlanMemoryFailureInfo() { } } +bool MemPlan::RecordOverflowIfAny() { + if (!failApplyBufferInfo.empty()) { + return true; + } + if (planMode != MemPlanMode::LOCAL_MEM_PLAN || + memscope2rootStorageEntry.empty()) { + return false; + } + + for (auto &it : memscope2rootStorageEntry) { + auto *rootStorageEntry = it.second; + if (!rootStorageEntry) { + continue; + } + auto bufferSpaceInfo = + GetBufferSpaceInfo(rootStorageEntry->bufInfo->bufferScope); + size_t maxBits = bufferSpaceInfo.second; + uint64_t maxAllocBits = rootStorageEntry->alignedConstBits; + for (auto *child : rootStorageEntry->mergedChildren) { + maxAllocBits = + std::max(maxAllocBits, child->bitsOffset + child->alignedConstBits); + } + if (maxAllocBits > maxBits) { + failApplyBufferInfo[rootStorageEntry->bufInfo->bufferScope] = + maxAllocBits; + } + } + + return !failApplyBufferInfo.empty(); +} + // Plan Memory algorithm. LogicalResult MemPlan::plan() { // Construct StorageEntry structure. @@ -712,6 +739,10 @@ LogicalResult MemPlan::plan() { EmitPlanMemoryFailureInfo(); return failure(); } + if (RecordOverflowIfAny()) { + EmitPlanMemoryFailureInfo(); + return failure(); + } // Update the address information of each buffer after memory buffer. 
UpdateBuffer2Offsets(); if (enablePrintMemoryAllocatedSize) { @@ -756,7 +787,8 @@ void MemPlan::PrintSuccessfulAllocatedMaxBits() { for (auto& child : it->second->mergedChildren) { ubAllocBits = std::max(ubAllocBits, child->bitsOffset + child->alignedConstBits); } - llvm::outs() << "[AscendNPU IR] Allocated UB size = " << ubAllocBits << " bits "<< "\n"; + llvm::outs() << "[PTOPlanMemory] Allocated UB size = " << ubAllocBits + << " bits\n"; } } @@ -1756,88 +1788,66 @@ void MemPlan::ReportAllocatedEntryDebugInfo(StorageEntry *rootStorageEntry) { } LogicalResult MemPlan::InitMemSpecsFromModule(func::FuncOp funcOp) { - ubSpaceSize = 1572864; - l1SpaceSize = 4194304; - l0aSpaceSize = 524288; - l0bSpaceSize = 524288; - l0cSpaceSize = 1048576; - ubAlignSize = 256; - l1AlignSize = 256; - l0cAlignSize = 4096; - l0aAlignSize = 4096; - l0bAlignSize = 4096; - biasAlignSize = 256; - biasSpaceSize = 524288; - scalingAlignSize = 256; - scalingSpaceSize = 1572864; + struct MemSpec { + int ubSpaceSize; + int l1SpaceSize; + int l0aSpaceSize; + int l0bSpaceSize; + int l0cSpaceSize; + int ubAlignSize; + int l1AlignSize; + int l0cAlignSize; + int l0aAlignSize; + int l0bAlignSize; + int biasAlignSize; + int biasSpaceSize; + int scalingAlignSize; + int scalingSpaceSize; + }; - auto moduleOp = getTopLevelModuleOp(funcOp); - StringAttr strAttr = moduleOp->getAttrOfType("pto.device-spec"); - if (!strAttr) { - return success(); - } + const MemSpec kA3 = { + 1572864, 4194304, 524288, 524288, 1048576, 256, 256, + 4096, 4096, 4096, 256, 524288, 256, 1572864}; + const MemSpec kA5 = { + 2031616, 4194304, 524288, 524288, 2097152, 256, 256, + 4096, 4096, 4096, 256, 524288, 256, 2031616}; + + auto applySpec = [this](const MemSpec &spec) { + ubSpaceSize = spec.ubSpaceSize; + l1SpaceSize = spec.l1SpaceSize; + l0aSpaceSize = spec.l0aSpaceSize; + l0bSpaceSize = spec.l0bSpaceSize; + l0cSpaceSize = spec.l0cSpaceSize; + ubAlignSize = spec.ubAlignSize; + l1AlignSize = spec.l1AlignSize; + l0cAlignSize = 
spec.l0cAlignSize; + l0aAlignSize = spec.l0aAlignSize; + l0bAlignSize = spec.l0bAlignSize; + biasAlignSize = spec.biasAlignSize; + biasSpaceSize = spec.biasSpaceSize; + scalingAlignSize = spec.scalingAlignSize; + scalingSpaceSize = spec.scalingSpaceSize; + }; - if (strAttr.getValue().str() == "Ascend910B1" || - strAttr.getValue().str() == "Ascend910B2" || - strAttr.getValue().str() == "Ascend910B3" || - strAttr.getValue().str() == "Ascend910B4" || - strAttr.getValue().str() == "Ascend910_9362" || - strAttr.getValue().str() == "Ascend910_9372" || - strAttr.getValue().str() == "Ascend910_9381" || - strAttr.getValue().str() == "Ascend910_9382" || - strAttr.getValue().str() == "Ascend910_9391" || - strAttr.getValue().str() == "Ascend910_9392") { - return success(); - } + // Default to a3. + applySpec(kA3); - if (strAttr.getValue().str() == "Ascend310B1" || - strAttr.getValue().str() == "Ascend310B2" || - strAttr.getValue().str() == "Ascend310B3" || - strAttr.getValue().str() == "Ascend310B4") { - ubSpaceSize = 2097152; - l1SpaceSize = 8388608; - l0aSpaceSize = 524288; - l0bSpaceSize = 524288; - l0cSpaceSize = 1048576; - ubAlignSize = 256; - l1AlignSize = 256; - l0cAlignSize = 4096; - l0aAlignSize = 4096; - l0bAlignSize = 4096; - biasAlignSize = 256; - biasSpaceSize = 524288; - scalingAlignSize = 256; - scalingSpaceSize = 2097152; + auto moduleOp = getTopLevelModuleOp(funcOp); + StringAttr archAttr = moduleOp->getAttrOfType<StringAttr>("pto.target_arch"); + if (!archAttr) { return success(); } - if (strAttr.getValue().str() == "Ascend910_950z" || - strAttr.getValue().str() == "Ascend910_9579" || - strAttr.getValue().str() == "Ascend910_957b" || - strAttr.getValue().str() == "Ascend910_957d" || - strAttr.getValue().str() == "Ascend910_950z" || - strAttr.getValue().str() == "Ascend910_9581" || - strAttr.getValue().str() == "Ascend910_9589" || - strAttr.getValue().str() == "Ascend910_958a" || - strAttr.getValue().str() == "Ascend910_958b" || - strAttr.getValue().str() == 
"Ascend910_9599") { - ubSpaceSize = 2031616; - l1SpaceSize = 4194304; - l0aSpaceSize = 524288; - l0bSpaceSize = 524288; - l0cSpaceSize = 2097152; - ubAlignSize = 256; - l1AlignSize = 256; - l0cAlignSize = 4096; - l0aAlignSize = 4096; - l0bAlignSize = 4096; - biasAlignSize = 256; - biasSpaceSize = 524288; - scalingAlignSize = 256; - scalingSpaceSize = 2031616; - return success(); + std::string arch = archAttr.getValue().str(); + for (char &c : arch) + c = static_cast<char>(std::tolower(static_cast<unsigned char>(c))); + + // --pto-arch options: + // a3 -> default memory spec + // a5 -> override memory spec + if (arch == "a5") { + applySpec(kA5); } - return success(); } diff --git a/lib/PTO/Transforms/PTOPlanMemory.h b/lib/PTO/Transforms/PTOPlanMemory.h index e7cff4f7..6089087c 100644 --- a/lib/PTO/Transforms/PTOPlanMemory.h +++ b/lib/PTO/Transforms/PTOPlanMemory.h @@ -466,6 +466,9 @@ class MemPlan { /// Print successful memory alloc. void PrintSuccessfulAllocatedMaxBits(); + /// Post-plan sanity check for local memory overflow. + bool RecordOverflowIfAny(); + /// Prepare the memref.alloc plan. PlanStatus PlanLocalMemAddress(); diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index 3ee008f0..556824b6 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -332,11 +332,11 @@ def _inject_packed_pred_mask_preload( return kernel_text[:insert_at] + block + kernel_text[insert_at:] -def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: +def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str: # Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need # the "cube" arch; pure vector kernels can use the vector arch. # - # IMPORTANT: the default arch depends on the target architecture. + # IMPORTANT: the default arch depends on the Ascend SoC. 
cube_markers = ( "TileType::Mat", "TileType::Left", @@ -354,18 +354,28 @@ def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: ) needs_cube = any(m in kernel_text for m in cube_markers) - arch = (pto_arch or "").strip().lower() - if arch == "a5": - # A5 uses A5 instruction set. pto-isa examples build A5 kernels with - # dav-c310-{vec|cube}. + sv = (soc_version or "").lower() + if "950" in sv or "a5" in sv: + # Ascend950 (A5) uses A5 instruction set. pto-isa examples build A5 + # kernels with dav-c310-{vec|cube}. return "dav-c310-cube" if needs_cube else "dav-c310-vec" - if arch == "a3": + if "910b" in sv: + # Ascend910B* (e.g. Ascend910B1) uses dav-c310 toolchain arch. + return "dav-c310-cube" if needs_cube else "dav-c310-vec" + if "a3" in sv: # A2/A3 uses dav-c220 toolchain arch. return "dav-c220-cube" if needs_cube else "dav-c220-vec" - # Default to Ascend910 (dav-c220) when arch is unknown. + # Default to Ascend910 (dav-c220) when SoC is unknown. return "dav-c220-cube" if needs_cube else "dav-c220-vec" +def _soc_version_for_arch(arch: Optional[str]) -> str: + a = (arch or "").strip().lower() + if a == "a5": + return "Ascend910_9599" + return "Ascend910B1" + + def _parse_int_list(blob: str): items = [] for part in blob.split(","): @@ -828,6 +838,7 @@ def generate_testcase( has_dav_cube = "__DAV_CUBE__" in raw_kernel has_dav_vec = "__DAV_VEC__" in raw_kernel + soc_version = _soc_version_for_arch(pto_arch) if aicore_arch is None: # Sectioned kernels contain `#if defined(__DAV_CUBE__)` / `__DAV_VEC__` # blocks. They frequently rely on cross-section synchronization (e.g. @@ -836,15 +847,15 @@ def generate_testcase( # may be unavailable; build with a vector arch and explicitly enable the # section macros instead. 
if has_dav_cube or has_dav_vec: - arch = (pto_arch or "").strip().lower() - if arch == "a5": + sv = (soc_version or "").lower() + if "950" in sv or "a5" in sv: + aicore_arch = "dav-c310-vec" + elif "910b" in sv: aicore_arch = "dav-c310-vec" - elif arch == "a3": - aicore_arch = "dav-c220-vec" else: aicore_arch = "dav-c220-vec" else: - aicore_arch = _infer_aicore_arch(raw_kernel, pto_arch) + aicore_arch = _infer_aicore_arch(raw_kernel, soc_version) # Force-define DAV section macros so both sections are compiled into the # same binary. This keeps the generated validation executable self-contained @@ -1194,10 +1205,10 @@ def generate_testcase( (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE vs - # REGISTER_BASE. A5 uses REGISTER_BASE. + # REGISTER_BASE. A5 and Ascend910B use REGISTER_BASE. mem_base_define = "MEMORY_BASE" - arch = (pto_arch or "").strip().lower() - if arch == "a5": + sv = (soc_version or "").lower() + if "910b" in sv or "950" in sv or "a5" in sv: mem_base_define = "REGISTER_BASE" # CCE printing support is gated behind `--cce-enable-print` on some bisheng @@ -1225,7 +1236,7 @@ def generate_testcase( set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) if(NOT DEFINED SOC_VERSION) - set(SOC_VERSION Ascend910) + set(SOC_VERSION Ascend910B1) endif() option(ENABLE_SIM_GOLDEN "Build Ascend simulator (camodel) executable" ON) @@ -1389,11 +1400,13 @@ def generate_testcase( encoding="utf-8", ) - arch_for_runsh = pto_arch or "Ascend910" + arch_for_runsh = (pto_arch or "a3").strip().lower() + soc_for_runsh = _soc_version_for_arch(arch_for_runsh) run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8") run_sh = run_sh.replace("@EXECUTABLE@", testcase) run_sh = run_sh.replace("@RUN_MODE@", run_mode) - run_sh = run_sh.replace("@SOC_VERSION@", arch_for_runsh) + run_sh = run_sh.replace("@PTO_ARCH@", arch_for_runsh) + run_sh = 
run_sh.replace("@SOC_VERSION@", soc_for_runsh) run_path = output_dir / "run.sh" run_path.write_text(run_sh, encoding="utf-8") run_path.chmod(0o755) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 8a6574e3..0a9a56c2 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -123,10 +123,10 @@ export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')" case "${pto_arch_lc}" in - a5) SOC_VERSION="Ascend910_95" ;; - a3) SOC_VERSION="Ascend910B" ;; + a5) SOC_VERSION="Ascend910_9599" ;; + a3) SOC_VERSION="Ascend910B1" ;; *) - SOC_VERSION="Ascend910B" + SOC_VERSION="Ascend910B1" pto_arch_lc="a3" ;; esac diff --git a/test/npu_validation/templates/run_sh_template.sh b/test/npu_validation/templates/run_sh_template.sh index 5c31a286..753a3f36 100644 --- a/test/npu_validation/templates/run_sh_template.sh +++ b/test/npu_validation/templates/run_sh_template.sh @@ -2,7 +2,17 @@ set -euo pipefail RUN_MODE="@RUN_MODE@" -SOC_VERSION="@SOC_VERSION@" +PTO_ARCH="${PTO_ARCH:-@PTO_ARCH@}" +SOC_VERSION="${SOC_VERSION:-@SOC_VERSION@}" +if [[ -z "${PTO_ARCH}" || "${PTO_ARCH}" == "@PTO_ARCH@" ]]; then + PTO_ARCH="a3" +fi +if [[ -z "${SOC_VERSION}" || "${SOC_VERSION}" == "@SOC_VERSION@" ]]; then + case "${PTO_ARCH,,}" in + a5) SOC_VERSION="Ascend910_9599" ;; + *) SOC_VERSION="Ascend910B1" ;; + esac +fi GOLDEN_MODE="${GOLDEN_MODE:-npu}" # sim|npu|skip BUILD_DIR="${BUILD_DIR:-build}" From d87f32cc5dc71266eb243d3ff4f6a1de47a9c661 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Thu, 12 Mar 2026 20:42:35 +0800 Subject: [PATCH 12/14] Use pto-arch only for npu validation --- .../scripts/generate_testcase.py | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py 
b/test/npu_validation/scripts/generate_testcase.py index 556824b6..f447211d 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -332,11 +332,11 @@ def _inject_packed_pred_mask_preload( return kernel_text[:insert_at] + block + kernel_text[insert_at:] -def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str: +def _infer_aicore_arch(kernel_text: str, pto_arch: Optional[str]) -> str: # Heuristic: kernels that touch cube/L0/L1 tile types or cbuf memories need # the "cube" arch; pure vector kernels can use the vector arch. # - # IMPORTANT: the default arch depends on the Ascend SoC. + # IMPORTANT: the default arch depends on the target architecture. cube_markers = ( "TileType::Mat", "TileType::Left", @@ -354,18 +354,15 @@ def _infer_aicore_arch(kernel_text: str, soc_version: str) -> str: ) needs_cube = any(m in kernel_text for m in cube_markers) - sv = (soc_version or "").lower() - if "950" in sv or "a5" in sv: - # Ascend950 (A5) uses A5 instruction set. pto-isa examples build A5 - # kernels with dav-c310-{vec|cube}. + arch = (pto_arch or "").strip().lower() + if arch == "a5": + # A5 uses A5 instruction set. pto-isa examples build A5 kernels with + # dav-c310-{vec|cube}. return "dav-c310-cube" if needs_cube else "dav-c310-vec" - if "910b" in sv: - # Ascend910B* (e.g. Ascend910B1) uses dav-c310 toolchain arch. - return "dav-c310-cube" if needs_cube else "dav-c310-vec" - if "a3" in sv: + if arch == "a3": # A2/A3 uses dav-c220 toolchain arch. return "dav-c220-cube" if needs_cube else "dav-c220-vec" - # Default to Ascend910 (dav-c220) when SoC is unknown. + # Default to Ascend910 (dav-c220) when arch is unknown. 
return "dav-c220-cube" if needs_cube else "dav-c220-vec" @@ -838,7 +835,6 @@ def generate_testcase( has_dav_cube = "__DAV_CUBE__" in raw_kernel has_dav_vec = "__DAV_VEC__" in raw_kernel - soc_version = _soc_version_for_arch(pto_arch) if aicore_arch is None: # Sectioned kernels contain `#if defined(__DAV_CUBE__)` / `__DAV_VEC__` # blocks. They frequently rely on cross-section synchronization (e.g. @@ -847,15 +843,15 @@ def generate_testcase( # may be unavailable; build with a vector arch and explicitly enable the # section macros instead. if has_dav_cube or has_dav_vec: - sv = (soc_version or "").lower() - if "950" in sv or "a5" in sv: - aicore_arch = "dav-c310-vec" - elif "910b" in sv: + arch = (pto_arch or "").strip().lower() + if arch == "a5": aicore_arch = "dav-c310-vec" + elif arch == "a3": + aicore_arch = "dav-c220-vec" else: aicore_arch = "dav-c220-vec" else: - aicore_arch = _infer_aicore_arch(raw_kernel, soc_version) + aicore_arch = _infer_aicore_arch(raw_kernel, pto_arch) # Force-define DAV section macros so both sections are compiled into the # same binary. This keeps the generated validation executable self-contained @@ -1205,10 +1201,10 @@ def generate_testcase( (output_dir / "launch.cpp").write_text(launch_cpp, encoding="utf-8") # pto-isa selects instruction implementations based on MEMORY_BASE vs - # REGISTER_BASE. A5 and Ascend910B use REGISTER_BASE. + # REGISTER_BASE. A5 uses REGISTER_BASE. 
mem_base_define = "MEMORY_BASE" - sv = (soc_version or "").lower() - if "910b" in sv or "950" in sv or "a5" in sv: + arch = (pto_arch or "").strip().lower() + if arch == "a5": mem_base_define = "REGISTER_BASE" # CCE printing support is gated behind `--cce-enable-print` on some bisheng From bd7a34baa61909aac95304e8acf18d8edb112176 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Thu, 12 Mar 2026 20:49:04 +0800 Subject: [PATCH 13/14] Remove soc_version from npu validation flow --- .../scripts/generate_testcase.py | 18 ++++++----- .../scripts/run_remote_npu_validation.sh | 27 +++++----------- .../templates/run_sh_template.sh | 31 ++++++------------- 3 files changed, 29 insertions(+), 47 deletions(-) diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py index f447211d..a149e090 100644 --- a/test/npu_validation/scripts/generate_testcase.py +++ b/test/npu_validation/scripts/generate_testcase.py @@ -1231,8 +1231,14 @@ def generate_testcase( set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -if(NOT DEFINED SOC_VERSION) - set(SOC_VERSION Ascend910B1) +if(NOT DEFINED PTO_ARCH) + set(PTO_ARCH a3) +endif() +string(TOLOWER "${PTO_ARCH}" PTO_ARCH_LC) +if(PTO_ARCH_LC STREQUAL "a5") + set(SIM_SOC_DIR Ascend910_9599) +else() + set(SIM_SOC_DIR Ascend910B1) endif() option(ENABLE_SIM_GOLDEN "Build Ascend simulator (camodel) executable" ON) @@ -1339,9 +1345,9 @@ def generate_testcase( ) target_link_directories({testcase}_sim PUBLIC ${{ASCEND_HOME_PATH}}/lib64 - ${{ASCEND_HOME_PATH}}/aarch64-linux/simulator/${{SOC_VERSION}}/lib - ${{ASCEND_HOME_PATH}}/simulator/${{SOC_VERSION}}/lib - ${{ASCEND_HOME_PATH}}/tools/simulator/${{SOC_VERSION}}/lib + ${{ASCEND_HOME_PATH}}/aarch64-linux/simulator/${{SIM_SOC_DIR}}/lib + ${{ASCEND_HOME_PATH}}/simulator/${{SIM_SOC_DIR}}/lib + ${{ASCEND_HOME_PATH}}/tools/simulator/${{SIM_SOC_DIR}}/lib ) target_link_libraries({testcase}_sim PRIVATE 
{testcase}_kernel @@ -1397,12 +1403,10 @@ def generate_testcase( ) arch_for_runsh = (pto_arch or "a3").strip().lower() - soc_for_runsh = _soc_version_for_arch(arch_for_runsh) run_sh = (templates_root / "run_sh_template.sh").read_text(encoding="utf-8") run_sh = run_sh.replace("@EXECUTABLE@", testcase) run_sh = run_sh.replace("@RUN_MODE@", run_mode) run_sh = run_sh.replace("@PTO_ARCH@", arch_for_runsh) - run_sh = run_sh.replace("@SOC_VERSION@", soc_for_runsh) run_path = output_dir / "run.sh" run_path.write_text(run_sh, encoding="utf-8") run_path.chmod(0o755) diff --git a/test/npu_validation/scripts/run_remote_npu_validation.sh b/test/npu_validation/scripts/run_remote_npu_validation.sh index 0a9a56c2..0a9c1e24 100644 --- a/test/npu_validation/scripts/run_remote_npu_validation.sh +++ b/test/npu_validation/scripts/run_remote_npu_validation.sh @@ -123,32 +123,21 @@ export LD_LIBRARY_PATH="${ASCEND_HOME_PATH}/lib64:${LD_LIBRARY_PATH:-}" pto_arch_lc="$(printf '%s' "${PTO_ARCH}" | tr '[:upper:]' '[:lower:]')" case "${pto_arch_lc}" in - a5) SOC_VERSION="Ascend910_9599" ;; - a3) SOC_VERSION="Ascend910B1" ;; + a5) SIM_SOC_DIR="Ascend910_9599" ;; + a3) SIM_SOC_DIR="Ascend910B1" ;; *) - SOC_VERSION="Ascend910B1" + SIM_SOC_DIR="Ascend910B1" pto_arch_lc="a3" ;; esac - -SIM_SOC_VERSION="${SOC_VERSION}" -# Some CANN installs do not provide a simulator directory named exactly -# "Ascend910". Map it to a real directory so we can link/run camodel. 
-if [[ "${SOC_VERSION}" == "Ascend910" ]]; then - if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then - SIM_SOC_VERSION="Ascend910A" - elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then - SIM_SOC_VERSION="Ascend910ProA" - fi -fi -log "SIM_SOC_VERSION=${SIM_SOC_VERSION}" +log "SIM_SOC_DIR=${SIM_SOC_DIR}" LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH}" for d in \ - "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \ - "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \ - "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do + "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_DIR}/lib" \ + "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_DIR}/lib" \ + "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_DIR}/lib"; do [[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}" done @@ -246,7 +235,7 @@ while IFS= read -r -d '' cpp; do enable_sim_golden="OFF" [[ "${GOLDEN_MODE}" == "sim" ]] && enable_sim_golden="ON" cmake -S . 
-B ./build \ - -DSOC_VERSION="${SIM_SOC_VERSION}" \ + -DPTO_ARCH="${PTO_ARCH}" \ -DENABLE_SIM_GOLDEN="${enable_sim_golden}" \ -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" cmake --build ./build --parallel diff --git a/test/npu_validation/templates/run_sh_template.sh b/test/npu_validation/templates/run_sh_template.sh index 753a3f36..7e9597f6 100644 --- a/test/npu_validation/templates/run_sh_template.sh +++ b/test/npu_validation/templates/run_sh_template.sh @@ -3,16 +3,14 @@ set -euo pipefail RUN_MODE="@RUN_MODE@" PTO_ARCH="${PTO_ARCH:-@PTO_ARCH@}" -SOC_VERSION="${SOC_VERSION:-@SOC_VERSION@}" if [[ -z "${PTO_ARCH}" || "${PTO_ARCH}" == "@PTO_ARCH@" ]]; then PTO_ARCH="a3" fi -if [[ -z "${SOC_VERSION}" || "${SOC_VERSION}" == "@SOC_VERSION@" ]]; then - case "${PTO_ARCH,,}" in - a5) SOC_VERSION="Ascend910_9599" ;; - *) SOC_VERSION="Ascend910B1" ;; - esac -fi + +case "${PTO_ARCH,,}" in + a5) SIM_SOC_DIR="Ascend910_9599" ;; + *) SIM_SOC_DIR="Ascend910B1" ;; +esac GOLDEN_MODE="${GOLDEN_MODE:-npu}" # sim|npu|skip BUILD_DIR="${BUILD_DIR:-build}" @@ -66,19 +64,10 @@ fi LD_LIBRARY_PATH_NPU="${LD_LIBRARY_PATH:-}" LD_LIBRARY_PATH_SIM="${LD_LIBRARY_PATH_NPU}" if [[ -n "${ASCEND_HOME_PATH:-}" ]]; then - SIM_SOC_VERSION="${SOC_VERSION}" - if [[ "${SOC_VERSION}" == "Ascend910" ]]; then - if [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910A/lib" ]]; then - SIM_SOC_VERSION="Ascend910A" - elif [[ -d "${ASCEND_HOME_PATH}/aarch64-linux/simulator/Ascend910ProA/lib" ]]; then - SIM_SOC_VERSION="Ascend910ProA" - fi - fi - for d in \ - "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_VERSION}/lib" \ - "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_VERSION}/lib" \ - "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_VERSION}/lib"; do + "${ASCEND_HOME_PATH}/aarch64-linux/simulator/${SIM_SOC_DIR}/lib" \ + "${ASCEND_HOME_PATH}/simulator/${SIM_SOC_DIR}/lib" \ + "${ASCEND_HOME_PATH}/tools/simulator/${SIM_SOC_DIR}/lib"; do [[ -d "$d" ]] && LD_LIBRARY_PATH_SIM="$d:${LD_LIBRARY_PATH_SIM}" done fi @@ -88,9 
+77,9 @@ cd "${ROOT_DIR}/${BUILD_DIR}" ENABLE_SIM_GOLDEN="OFF" [[ "${GOLDEN_MODE}" == "sim" ]] && ENABLE_SIM_GOLDEN="ON" if [[ -n "${PTO_ISA_ROOT:-}" ]]; then - cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" .. + cmake -DPTO_ARCH="${PTO_ARCH}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" -DPTO_ISA_ROOT="${PTO_ISA_ROOT}" .. else - cmake -DSOC_VERSION="${SIM_SOC_VERSION:-${SOC_VERSION}}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" .. + cmake -DPTO_ARCH="${PTO_ARCH}" -DENABLE_SIM_GOLDEN="${ENABLE_SIM_GOLDEN}" .. fi make -j From a8c6755710a74d6b89243ffffa6e2316a689d6b8 Mon Sep 17 00:00:00 2001 From: PTOAS Date: Fri, 13 Mar 2026 10:49:48 +0800 Subject: [PATCH 14/14] Remove planmemory scopes test and align samples to target_arch --- .../basic/plan_memory_scopes_independent.mlir | 29 ------------------- .../Matmul_transpose-pto-ir.pto | 2 +- .../Matmul_transpose/Matmul_transpose.py | 2 +- .../samples/Sync/test_if_else_tile_result.pto | 2 +- .../plan_memory_scopes_independent.py | 26 ----------------- 5 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 test/basic/plan_memory_scopes_independent.mlir delete mode 100644 test/samples/planmemory/plan_memory_scopes_independent.py diff --git a/test/basic/plan_memory_scopes_independent.mlir b/test/basic/plan_memory_scopes_independent.mlir deleted file mode 100644 index b95b8c7e..00000000 --- a/test/basic/plan_memory_scopes_independent.mlir +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: ptoas %s 2>&1 1>/dev/null | FileCheck %s - -module { - func.func @scopes_independent(%arg0: memref<16x16x16xf16, #pto.address_space>, - %arg1: memref<16x16x16xf16, #pto.address_space>) { - %ub = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - %l1 = memref.alloc() : memref<16x16x16xf16, #pto.address_space> - - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%ub : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%ub 
: memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - - pto.tload ins(%arg0 : memref<16x16x16xf16, #pto.address_space>) - outs(%l1 : memref<16x16x16xf16, #pto.address_space>) - pto.tstore ins(%l1 : memref<16x16x16xf16, #pto.address_space>) - outs(%arg1 : memref<16x16x16xf16, #pto.address_space>) - return - } -} - -// CHECK: end PTO plan Mem! -// CHECK: func.func @scopes_independent -// CHECK-NOT: memref.alloc -// Offsets are planned per-scope, so both UB and L1 can start at 0. -// CHECK-DAG: %c0_i64 = arith.constant 0 : i64 -// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space<{{vec|ub}}>> -// CHECK-DAG: pto.pointer_cast(%c0_i64) : memref<16x16x16xf16, #pto.address_space> - diff --git a/test/samples/Matmul_transpose/Matmul_transpose-pto-ir.pto b/test/samples/Matmul_transpose/Matmul_transpose-pto-ir.pto index 65848d08..0aa22e11 100644 --- a/test/samples/Matmul_transpose/Matmul_transpose-pto-ir.pto +++ b/test/samples/Matmul_transpose/Matmul_transpose-pto-ir.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @RunTEXTRACT(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: i1, %arg4: i1) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/Matmul_transpose/Matmul_transpose.py b/test/samples/Matmul_transpose/Matmul_transpose.py index adb843c1..14014211 100644 --- a/test/samples/Matmul_transpose/Matmul_transpose.py +++ b/test/samples/Matmul_transpose/Matmul_transpose.py @@ -41,7 +41,7 @@ def build( pto.register_dialect(ctx, load=True) module = builtin.ModuleOp() - module.attributes["pto.device-spec"] = StringAttr.get("Ascend910B1") + module.attributes["pto.target_arch"] = StringAttr.get("a3") t_out = F32Type.get() t_a = F32Type.get() diff --git a/test/samples/Sync/test_if_else_tile_result.pto b/test/samples/Sync/test_if_else_tile_result.pto index 04a05062..58eca83d 
100644 --- a/test/samples/Sync/test_if_else_tile_result.pto +++ b/test/samples/Sync/test_if_else_tile_result.pto @@ -1,4 +1,4 @@ -module attributes {"pto.device-spec" = "Ascend910B1"} { +module attributes {"pto.target_arch" = "a3"} { func.func @test_if_else_tile_result(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: !pto.ptr) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/test/samples/planmemory/plan_memory_scopes_independent.py b/test/samples/planmemory/plan_memory_scopes_independent.py deleted file mode 100644 index 4310d2d5..00000000 --- a/test/samples/planmemory/plan_memory_scopes_independent.py +++ /dev/null @@ -1,26 +0,0 @@ -PTO_IR = r""" - -module { - func.func @scopes_independent(%arg0: memref<16x256xf16, #pto.address_space>, - %arg1: memref<16x256xf16, #pto.address_space>) { - %ub = pto.alloc_tile : !pto.tile_buf - %l1 = pto.alloc_tile : !pto.tile_buf - - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%ub : !pto.tile_buf) - pto.tstore ins(%ub : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - - pto.tload ins(%arg0 : memref<16x256xf16, #pto.address_space>) - outs(%l1 : !pto.tile_buf) - pto.tstore ins(%l1 : !pto.tile_buf) - outs(%arg1 : memref<16x256xf16, #pto.address_space>) - return - } -} - -// Offsets are planned per-scope, so both UB and L1 can start at 0. -""" - -if __name__ == "__main__": - print(PTO_IR)