From 0cd820de00af12926a3be8adcac4dcca2259f443 Mon Sep 17 00:00:00 2001
From: zhangqi-chen
Date: Tue, 10 Feb 2026 20:08:08 +0800
Subject: [PATCH] feat(codegen): Integrate PTOAS package into PTO backend
 compilation pipeline

Add ptoas as a project dependency and invoke it automatically to compile
.pto MLIR files to C++ when using the PTO backend.

Update three PTO MLIR reference files (addc, adds, mul).
---
 pyproject.toml                     |  2 +-
 python/pypto/ir/compile.py         | 45 ++++++++++++++++++++++++++++++-
 reference/pto-isa/addc-pto-ir.mlir | 34 ++++++++++++-------------
 reference/pto-isa/adds-pto-ir.mlir | 20 +++++++--------
 reference/pto-isa/mul-pto-ir.mlir  | 26 +++++++++----------
 5 files changed, 85 insertions(+), 42 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9c8f8600..a3eedae9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ readme = { file = "README.md", content-type = "text/markdown" }
 license = "LicenseRef-CANN-Open-Software-License-Agreement-Version-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.9"
-dependencies = []
+dependencies = ["ptoas"]
 keywords = ["python", "pto", "tile", "tile-centric"]
 classifiers = [
     "Development Status :: 4 - Beta",
diff --git a/python/pypto/ir/compile.py b/python/pypto/ir/compile.py
index 5240b4a6..ffffd612 100644
--- a/python/pypto/ir/compile.py
+++ b/python/pypto/ir/compile.py
@@ -10,6 +10,8 @@
 """High-level API functions for PyPTO IR compilation."""
 
 import os
+import shutil
+import subprocess
 from datetime import datetime
 from typing import Optional
 
@@ -21,6 +23,43 @@
 from .pass_manager import OptimizationStrategy, PassManager
 
 
+def _run_ptoas(
+    pto_path: str,
+    output_path: str,
+    ptoas_flags: Optional[list[str]] = None,
+) -> None:
+    """Run the ptoas tool to compile a .pto file to C++.
+
+    Requires the ``ptoas`` package (``pip install ptoas``).
+
+    Args:
+        pto_path: Path to the input .pto file
+        output_path: Path for the output .cpp file
+        ptoas_flags: Additional flags to pass to ptoas (optional)
+
+    Raises:
+        FileNotFoundError: If the ptoas binary is not found in PATH
+        RuntimeError: If ptoas exits with a non-zero status
+    """
+    resolved_bin = shutil.which("ptoas")
+    if not resolved_bin:
+        raise FileNotFoundError(
+            "ptoas binary not found in PATH. Please install the ptoas package: pip install ptoas"
+        )
+
+    cmd = [resolved_bin, pto_path, "-o", output_path]
+    if ptoas_flags:
+        cmd.extend(ptoas_flags)
+
+    result = subprocess.run(cmd, capture_output=True, text=True, check=False)
+    if result.returncode != 0:
+        # Fall back to stdout so the error is never raised with an empty explanation.
+        details = result.stderr.strip() or result.stdout.strip() or "no diagnostic output"
+        raise RuntimeError(
+            f"ptoas compilation failed (exit code {result.returncode}): {details}"
+        )
+
+
 def compile(
     program: _ir_core.Program,
     output_dir: Optional[str] = None,
@@ -34,7 +73,8 @@ def compile(
     1. Runs optimization passes via PassManager
     2. Optionally dumps IR before and after each pass (if dump_passes=True)
     3. Generates code via selected backend (PTO or CCE)
-    4. Saves all artifacts to a unified output directory
+    4. For PTO backend, optionally invokes ptoas to compile .pto to .cpp
+    5. Saves all artifacts to a unified output directory
 
     Args:
         program: Input Program to compile
@@ -76,6 +116,9 @@ def compile(
         pto_path = os.path.join(output_dir, "output.pto")
         with open(pto_path, "w") as f:
             f.write(pto_code)
+        # Run ptoas with --enable-insert-sync
+        cpp_path = os.path.join(output_dir, "output.cpp")
+        _run_ptoas(pto_path, cpp_path, ptoas_flags=["--enable-insert-sync"])
     elif backend_type == BackendType.CCE:
         codegen_instance = _codegen_core.CCECodegen()
         files = codegen_instance.generate(transformed_program)  # type: ignore[arg-type]
diff --git a/reference/pto-isa/addc-pto-ir.mlir b/reference/pto-isa/addc-pto-ir.mlir
index 3609c4a9..819a6243 100644
--- a/reference/pto-isa/addc-pto-ir.mlir
+++ b/reference/pto-isa/addc-pto-ir.mlir
@@ -3,23 +3,23 @@ module {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
-    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %2 = pto.make_tensor_view %arg2, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %3 = pto.make_tensor_view %arg3, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %4 = pto.subview %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %5 = pto.subview %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %6 = pto.subview %2, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %7 = pto.alloc_tile :
-    %8 = pto.alloc_tile :
-    %9 = pto.alloc_tile :
-    %10 = pto.alloc_tile :
-    pto.tload ins(%4 : !pto.tile_view<32x32xf32>) outs(%7 : !pto.tile_buf)
-    pto.tload ins(%5 : !pto.tile_view<32x32xf32>) outs(%8 : !pto.tile_buf)
-    pto.tload ins(%6 : !pto.tile_view<32x32xf32>) outs(%9 : !pto.tile_buf)
-    pto.taddc ins(%7, %8, %9 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%10 : !pto.tile_buf)
-    %11 = pto.subview %3, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    pto.tstore ins(%10 : !pto.tile_buf) outs(%11 : !pto.tile_view<32x32xf32>)
+    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<32x32xf32>
+    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<32x32xf32>
+    %2 = pto.make_tensor_view %arg2, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<32x32xf32>
+    %3 = pto.make_tensor_view %arg3, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<32x32xf32>
+    %4 = pto.partition_view %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<32x32xf32> -> !pto.partition_tensor_view<32x32xf32>
+    %5 = pto.partition_view %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<32x32xf32> -> !pto.partition_tensor_view<32x32xf32>
+    %6 = pto.partition_view %2, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<32x32xf32> -> !pto.partition_tensor_view<32x32xf32>
+    %7 = pto.alloc_tile : !pto.tile_buf
+    %8 = pto.alloc_tile : !pto.tile_buf
+    %9 = pto.alloc_tile : !pto.tile_buf
+    %10 = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%4 : !pto.partition_tensor_view<32x32xf32>) outs(%7 : !pto.tile_buf)
+    pto.tload ins(%5 : !pto.partition_tensor_view<32x32xf32>) outs(%8 : !pto.tile_buf)
+    pto.tload ins(%6 : !pto.partition_tensor_view<32x32xf32>) outs(%9 : !pto.tile_buf)
+    pto.taddc ins(%7, %8, %9 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf) outs(%10 : !pto.tile_buf)
+    %11 = pto.partition_view %3, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<32x32xf32> -> !pto.partition_tensor_view<32x32xf32>
+    pto.tstore ins(%10 : !pto.tile_buf) outs(%11 : !pto.partition_tensor_view<32x32xf32>)
     return
   }
 }
diff --git a/reference/pto-isa/adds-pto-ir.mlir b/reference/pto-isa/adds-pto-ir.mlir
index ef99f081..cb33dc8e 100644
--- a/reference/pto-isa/adds-pto-ir.mlir
+++ b/reference/pto-isa/adds-pto-ir.mlir
@@ -4,16 +4,16 @@ module {
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
     %cst = arith.constant 3.140000e+00 : f32
-    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %2 = pto.subview %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %3 = pto.subview %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %4 = pto.alloc_tile :
-    %5 = pto.alloc_tile :
-    pto.tload ins(%2 : !pto.tile_view<32x32xf32>) outs(%4 : !pto.tile_buf)
-    pto.tadds ins(%4, %cst : !pto.tile_buf, f32) outs(%5 : !pto.tile_buf)
-    %6 = pto.subview %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    pto.tstore ins(%5 : !pto.tile_buf) outs(%6 : !pto.tile_view<32x32xf32>)
+    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view
+    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view
+    %2 = pto.partition_view %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    %3 = pto.partition_view %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    %4 = pto.alloc_tile : !pto.tile_buf
+    %5 = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%2 : !pto.partition_tensor_view<32x32xf32>) outs(%4 : !pto.tile_buf)
+    pto.tadds ins(%4, %cst : !pto.tile_buf, f32) outs(%5 : !pto.tile_buf)
+    %6 = pto.partition_view %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    pto.tstore ins(%5 : !pto.tile_buf) outs(%6 : !pto.partition_tensor_view<32x32xf32>)
     return
   }
 }
diff --git a/reference/pto-isa/mul-pto-ir.mlir b/reference/pto-isa/mul-pto-ir.mlir
index 7016d1e9..5f32c358 100644
--- a/reference/pto-isa/mul-pto-ir.mlir
+++ b/reference/pto-isa/mul-pto-ir.mlir
@@ -3,19 +3,19 @@ module {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
-    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %2 = pto.make_tensor_view %arg2, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view<2xf32>
-    %3 = pto.subview %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %4 = pto.subview %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    %5 = pto.alloc_tile :
-    %6 = pto.alloc_tile :
-    %7 = pto.alloc_tile :
-    pto.tload ins(%3 : !pto.tile_view<32x32xf32>) outs(%5 : !pto.tile_buf)
-    pto.tload ins(%4 : !pto.tile_view<32x32xf32>) outs(%6 : !pto.tile_buf)
-    pto.tmul ins(%5 : !pto.tile_buf, %6 : !pto.tile_buf) outs(%7 : !pto.tile_buf)
-    %8 = pto.subview %2, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    pto.tstore ins(%7 : !pto.tile_buf) outs(%8 : !pto.tile_view<32x32xf32>)
+    %0 = pto.make_tensor_view %arg0, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view
+    %1 = pto.make_tensor_view %arg1, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view
+    %2 = pto.make_tensor_view %arg2, shape = [%c32, %c32] strides = [%c32, %c1] : !pto.tensor_view
+    %3 = pto.partition_view %0, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    %4 = pto.partition_view %1, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    %5 = pto.alloc_tile : !pto.tile_buf
+    %6 = pto.alloc_tile : !pto.tile_buf
+    %7 = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%3 : !pto.partition_tensor_view<32x32xf32>) outs(%5 : !pto.tile_buf)
+    pto.tload ins(%4 : !pto.partition_tensor_view<32x32xf32>) outs(%6 : !pto.tile_buf)
+    pto.tmul ins(%5, %6 : !pto.tile_buf, !pto.tile_buf) outs(%7 : !pto.tile_buf)
+    %8 = pto.partition_view %2, offsets = [%c0, %c0], sizes = [%c32, %c32] : !pto.tensor_view -> !pto.partition_tensor_view<32x32xf32>
+    pto.tstore ins(%7 : !pto.tile_buf) outs(%8 : !pto.partition_tensor_view<32x32xf32>)
     return
   }
 }