hw-native-sys · Hzfengsy · Feb 26, 2026 · Feb 26, 2026
diff --git a/docs/dev/codegen/00-pto_codegen.md b/docs/dev/codegen/00-pto_codegen.md
@@ -8,9 +8,10 @@ The PTO Codegen (`PTOCodegen`) generates MLIR code in PTO-ISA dialect from PyPTO
 
 - **Automatic MLIR Generation**: Converts PyPTO IR to PTO-ISA MLIR dialect
 - **Structured Code Generation**: Outputs constants, tensor views, allocations in order
-- **Implicit Lowering**: Automatically generates `pto.subview` from `block.load`/`block.store`
+- **Implicit Lowering**: Automatically generates `pto.partition_view` from `block.load`/`block.store`
 - **MemRef-based Allocation**: Maps IR MemRef objects to `pto.alloc_tile` operations
-- **Type-aware Conversion**: Handles TensorType, TileType, ScalarType appropriately
+- **Type-aware Conversion**: Derives tile_buf/tensor_view types from TileType metadata
+- **PTOAS Type Annotations**: Emits typed `ins`/`outs` clauses for all operations
 
 ### Generation Order
 
@@ -25,32 +26,47 @@ The codegen generates MLIR in the following fixed order:
 
 ### Class Structure
 
-**Header**: `include/pypto/codegen/pto_codegen.h`
+**Header**: `include/pypto/codegen/pto/pto_codegen.h`
 
 ```cpp
 namespace pypto::codegen {
 
-class PTOCodegen {
+class PTOCodegen : public CodegenBase {
  public:
-  PTOCodegen() = default;
-  ~PTOCodegen() = default;
+  PTOCodegen();
+  explicit PTOCodegen(const backend::Backend* backend);
 
-  // Generate PTO-ISA MLIR from program
   std::string Generate(const ir::ProgramPtr& program);
+
+  // CodegenBase interface
+  std::string GetCurrentResultTarget() const override;
+  void Emit(const std::string& line) override;
+  std::string GetExprAsCode(const ir::ExprPtr& expr) override;
+  std::string GetTypeString(const DataType& dtype) const override;
+
+  // PTO-specific helpers for operator codegen
+  std::string NewTemp();
+  std::string GetOrCreateTensorView(const ir::VarPtr& tensor);
+  std::string GetIndexConstant(int64_t val);
+  std::string GetOrEmitFloatConstant(double value, const std::string& mlir_type = "f32");
+  std::string GetTensorViewTypeString(const ir::TensorType* tensor_type) const;
+  std::string GetTileBufTypeString(const ir::MemRef* memref) const;
+  std::string GetExprTypeAnnotation(const ir::ExprPtr& expr);
+  std::string GetCurrentResultTileBufTypeString() const;
 };
 
 }  // namespace codegen
 ```
 
 ### Implementation Components
 
-**File**: `src/codegen/pto_codegen.cpp`
+**File**: `src/codegen/pto/pto_codegen.cpp`
 
 | Component | Purpose |
 | --------- | ------- |
-| `PTOMLIRCodegen` | Main visitor class for IR traversal |
-| `MemRefCollectorVisitor` | Collects all MemRef objects for allocation |
-| Helper functions | `DataTypeToMLIR()`, `MemorySpaceToMLIR()` |
+| `PTOCodegen` | Main visitor class (inherits `CodegenBase`) for IR traversal |
+| `MemRefCollectorVisitor` | Collects MemRef objects and their associated TileType for allocation |
+| Helper functions | `DataTypeToMLIRImpl()`, `MemorySpaceToMLIR()` |
 
 ## Python API
 
@@ -95,8 +111,8 @@ print(pto_code)
 
 | PyPTO Operation | Generated PTO-ISA |
 | --------------- | ----------------- |
-| `block.load(tensor, [row, col], [h, w])` | `pto.subview` + `pto.tload` |
-| `block.store(tile, [row, col], [h, w], tensor)` | `pto.subview` + `pto.tstore` |
+| `block.load(tensor, [row, col], [h, w])` | `pto.partition_view` + `pto.tload` |
+| `block.store(tile, [row, col], [h, w], tensor)` | `pto.partition_view` + `pto.tstore` |
 | `block.mul(lhs, rhs)` | `pto.tmul` |
 | `block.add(a, b, c)` | `pto.taddc` (3-operand add) |
 | `block.adds(tile, scalar)` | `pto.tadds` (tile + scalar) |
@@ -119,29 +135,30 @@ For each `TensorType` parameter, the codegen generates:
 %0 = pto.make_tensor_view %arg0,
      shape = [%c32, %c32]
      strides = [%c32, %c1]
-     : !pto.tensor_view<2xf32>
+     : !pto.tensor_view<?x?xf32>
 ```
 
 **Key aspects**:
 
 - Shape from `TensorType.shape_`
 - Strides computed as row-major: `[dim1, 1]` for 2D tensors
 - Constants (`%c32`, `%c1`) auto-generated
+- Tensor view type uses `?` for each dimension (e.g., `?x?xf32` for 2D)
 
 ### Allocation Generation
 
-Based on MemRef objects attached to TileType variables:
+Based on MemRef objects attached to TileType variables. The codegen derives tile dimensions and dtype from the associated TileType:
 
 ```mlir
-%0 = pto.alloc_tile : <loc=ub, dtype=f32, rows=32, cols=32,
+%0 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32,
                        v_row=32, v_col=32, blayout=row_major,
                        slayout=none_box, fractal=512, pad=0>
 ```
 
 **MemRef → alloc_tile mapping**:
 
-- Memory space (`MemRef.memory_space_`) → `loc` attribute
-- Tile dimensions inferred from usage context
+- Memory space (`MemRef.memory_space_`) → `loc` attribute (using PTO address space names)
+- Tile dtype and dimensions derived from associated TileType metadata
 - One allocation per unique MemRef
 
 ### Load Operation Transformation
@@ -155,21 +172,21 @@ tile_a = pl.load(tensor_a, [0, 0], [32, 32])
 **Generated MLIR** (two operations):
 
 ```mlir
-# 1. Create tile view
-%3 = pto.subview %tensor_view, offsets = [%c0, %c0],
+# 1. Create partition view
+%3 = pto.partition_view %tensor_view, offsets = [%c0, %c0],
                  sizes = [%c32, %c32]
-                 : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
+                 : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<32x32xf32>
 
 # 2. Load into tile buffer
-pto.tload ins(%3 : !pto.tile_view<32x32xf32>)
-          outs(%tile_buf : !pto.tile_buf<loc=ub, ...>)
+pto.tload ins(%3 : !pto.partition_tensor_view<32x32xf32>)
+          outs(%tile_buf : !pto.tile_buf<loc=vec, ...>)
 ```
 
 **Key transformations**:
 
 - Tensor parameter → tensor_view lookup
 - Offsets/sizes from `block.load` arguments
-- Output tile_buf from variable's MemRef
+- Output tile_buf from variable's MemRef with type derived from TileType
 
 ### Store Operation Transformation
 
@@ -182,14 +199,14 @@ pl.store(tile_c, [0, 0], [32, 32], tensor_out)
 **Generated MLIR**:
 
 ```mlir
-# 1. Create tile view for output
-%5 = pto.subview %output_view, offsets = [%c0, %c0],
+# 1. Create partition view for output
+%5 = pto.partition_view %output_view, offsets = [%c0, %c0],
                  sizes = [%c32, %c32]
-                 : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
+                 : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<32x32xf32>
 
 # 2. Store from tile buffer
-pto.tstore ins(%tile_buf : !pto.tile_buf<loc=ub, ...>)
-           outs(%5 : !pto.tile_view<32x32xf32>)
+pto.tstore ins(%tile_buf : !pto.tile_buf<loc=vec, ...>)
+           outs(%5 : !pto.partition_tensor_view<32x32xf32>)
 ```
 
 ### Compute Operations
@@ -214,6 +231,7 @@ pto.tmul ins(%tile_a_buf : !pto.tile_buf<...>,
 
 - Result variable's MemRef determines output tile_buf
 - Input operands resolved through variable name lookup
+- All `ins`/`outs` clauses include type annotations
 
 ## Complete Example
 
@@ -254,38 +272,38 @@ module {
 
     // Tensor views
     %3 = pto.make_tensor_view %arg0, shape = [%c32, %c32]
-         strides = [%c32, %c1] : !pto.tensor_view<2xf32>
+         strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
     %4 = pto.make_tensor_view %arg1, shape = [%c32, %c32]
-         strides = [%c32, %c1] : !pto.tensor_view<2xf32>
+         strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
     %5 = pto.make_tensor_view %arg2, shape = [%c32, %c32]
-         strides = [%c32, %c1] : !pto.tensor_view<2xf32>
+         strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
 
     // Allocations
-    %0 = pto.alloc_tile : <loc=ub, dtype=f32, rows=32, cols=32, ...>
-    %1 = pto.alloc_tile : <loc=ub, dtype=f32, rows=32, cols=32, ...>
-    %2 = pto.alloc_tile : <loc=ub, dtype=f32, rows=32, cols=32, ...>
+    %0 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, ...>
+    %1 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, ...>
+    %2 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, ...>
 
     // Load tile_a
-    %6 = pto.subview %3, offsets = [%c0, %c0], sizes = [%c32, %c32]
-         : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    pto.tload ins(%6 : !pto.tile_view<32x32xf32>)
+    %6 = pto.partition_view %3, offsets = [%c0, %c0], sizes = [%c32, %c32]
+         : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<32x32xf32>
+    pto.tload ins(%6 : !pto.partition_tensor_view<32x32xf32>)
               outs(%0 : !pto.tile_buf<...>)
 
     // Load tile_b
-    %7 = pto.subview %4, offsets = [%c0, %c0], sizes = [%c32, %c32]
-         : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
-    pto.tload ins(%7 : !pto.tile_view<32x32xf32>)
+    %7 = pto.partition_view %4, offsets = [%c0, %c0], sizes = [%c32, %c32]
+         : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<32x32xf32>
+    pto.tload ins(%7 : !pto.partition_tensor_view<32x32xf32>)
               outs(%1 : !pto.tile_buf<...>)
 
     // Multiply
     pto.tmul ins(%0 : !pto.tile_buf<...>, %1 : !pto.tile_buf<...>)
              outs(%2 : !pto.tile_buf<...>)
 
     // Store tile_c
-    %8 = pto.subview %5, offsets = [%c0, %c0], sizes = [%c32, %c32]
-         : !pto.tensor_view<2xf32> -> !pto.tile_view<32x32xf32>
+    %8 = pto.partition_view %5, offsets = [%c0, %c0], sizes = [%c32, %c32]
+         : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<32x32xf32>
     pto.tstore ins(%2 : !pto.tile_buf<...>)
-               outs(%8 : !pto.tile_view<32x32xf32>)
+               outs(%8 : !pto.partition_tensor_view<32x32xf32>)
 
     return
   }
@@ -303,6 +321,7 @@ The codegen maintains several mappings to track MLIR variable names:
 | `var_to_mlir_` | IR variable → MLIR SSA name | `"tile_a"` → `"%0"` |
 | `tensor_to_view_` | Parameter → tensor_view | `"a"` → `"%3"` |
 | `memref_to_mlir_` | MemRef pointer → tile_buf | `memref.get()` → `"%0"` |
+| `memref_to_tile_type_` | MemRef pointer → TileType | Used for deriving tile_buf types |
 
 **SSA value naming**:
 
@@ -324,7 +343,8 @@ The codegen:
 2. Resolves `tile_b` → `%1` via `var_to_mlir_`
 3. Gets `tile_c`'s MemRef from its TileType
 4. Maps MemRef → `%2` via `memref_to_mlir_`
-5. Generates: `pto.tmul ins(%0, %1) outs(%2)`
+5. Gets tile_buf type from `memref_to_tile_type_`
+6. Generates: `pto.tmul ins(%0 : !pto.tile_buf<...>, %1 : !pto.tile_buf<...>) outs(%2 : !pto.tile_buf<...>)`
 
 ## Type Conversions
 
@@ -342,42 +362,44 @@ The codegen:
 
 ### Memory Space Mapping
 
-| PyPTO MemorySpace | PTO-ISA loc |
-| ----------------- | ----------- |
-| `MemorySpace::DDR` | `ddr` |
-| `MemorySpace::UB` | `ub` (unified buffer) |
-| `MemorySpace::L1` | `l1` |
-| `MemorySpace::L0A` | `l0a` |
-| `MemorySpace::L0B` | `l0b` |
-| `MemorySpace::L0C` | `l0c` |
+| PyPTO MemorySpace | PTO Address Space |
+| ----------------- | ----------------- |
+| `MemorySpace::DDR` | `gm` (global memory) |
+| `MemorySpace::UB` | `vec` (vector buffer) |
+| `MemorySpace::L1` | `mat` (matrix buffer) |
+| `MemorySpace::L0A` | `left` |
+| `MemorySpace::L0B` | `right` |
+| `MemorySpace::L0C` | `acc` (accumulator) |
 
 ### Tile Buffer Attributes
 
-Generated `alloc_tile` operations include:
+Generated `alloc_tile` operations derive dtype and dimensions from TileType metadata, and layout/fractal/pad from the associated TileView (when available):
 
 ```mlir
 !pto.tile_buf<
-  loc=ub,              // Memory space
-  dtype=f32,           // Element data type
-  rows=32,             // Tile height
-  cols=32,             // Tile width
-  v_row=32,            // Virtual row size
-  v_col=32,            // Virtual column size
-  blayout=row_major,   // Block layout
-  slayout=none_box,    // Sub-layout
-  fractal=512,         // Fractal parameter
-  pad=0                // Padding
+  loc=vec,             // PTO address space (from MemorySpace)
+  dtype=f32,           // Element data type (from TileType)
+  rows=32,             // Tile height (from TileType shape)
+  cols=32,             // Tile width (from TileType shape)
+  v_row=32,            // Virtual row size (= rows)
+  v_col=32,            // Virtual column size (= cols)
+  blayout=row_major,   // Block layout (from TileView, default: row_major)
+  slayout=none_box,    // Scatter layout (from TileView, default: none_box)
+  fractal=512,         // Fractal size (from TileView, default: 512)
+  pad=0                // Pad mode as int (from TileView, default: 0/null)
 >
 ```
 
-## Limitations and Future Work
+**TileView-derived attributes**:
 
-### Current Limitations
+| Attribute | Source | Enum Values | Default |
+| --------- | ------ | ----------- | ------- |
+| `blayout` | `TileView::blayout` | `none_box`, `row_major`, `col_major` | `row_major` |
+| `slayout` | `TileView::slayout` | `none_box`, `row_major`, `col_major` | `none_box` |
+| `fractal` | `TileView::fractal` | uint64 | `512` |
+| `pad` | `TileView::pad` | `null(0)`, `zero(1)`, `max(2)`, `min(3)` | `null(0)` |
 
-1. **Fixed Tile Attributes**: `rows`, `cols`, `blayout` etc. use default values (32x32, row_major)
-2. **2D Tensors Only**: Shape/stride generation assumes 2D tensors
-3. **Single Memory Space**: All allocations use `ub` (unified buffer) by default
-4. **Limited Operations**: Only basic block operations supported
+When no TileView is associated with the MemRef, the codegen falls back to the default values listed above.
 
 ## See Also
 

diff --git a/include/pypto/codegen/pto/pto_codegen.h b/include/pypto/codegen/pto/pto_codegen.h
@@ -14,6 +14,7 @@
 
 #include <cstdint>
 #include <map>
+#include <memory>
 #include <set>
 #include <sstream>
 #include <string>
@@ -38,7 +39,7 @@ namespace codegen {
  *
  * Generates PTO-ISA MLIR format code from PyPTO IR Program.
  * Traverses the IR using the visitor pattern (aligned with CCECodegen).
- * Automatically generates make_tensor_view, subview, and alloc_tile instructions.
+ * Automatically generates make_tensor_view, partition_view, and alloc_tile instructions.
  */
 class PTOCodegen : public CodegenBase {
  public:
@@ -102,6 +103,26 @@ class PTOCodegen : public CodegenBase {
    */
   std::string GetOrEmitFloatConstant(double value, const std::string& mlir_type = "f32");
 
+  /**
+   * @brief Get tensor_view type string for a TensorType (e.g., "!pto.tensor_view<?x?xf32>")
+   */
+  std::string GetTensorViewTypeString(const ir::TensorType* tensor_type) const;
+
+  /**
+   * @brief Get tile_buf type string for a MemRef (e.g., "!pto.tile_buf<loc=vec, dtype=f32, ...>")
+   */
+  std::string GetTileBufTypeString(const ir::MemRef* memref) const;
+
+  /**
+   * @brief Get type annotation for an expression (for ins/outs clauses)
+   */
+  std::string GetExprTypeAnnotation(const ir::ExprPtr& expr);
+
+  /**
+   * @brief Get tile_buf type string for the current assignment result target
+   */
+  std::string GetCurrentResultTileBufTypeString() const;
+
  protected:
   // Override visitor methods for code generation - Statements
   void VisitStmt_(const ir::AssignStmtPtr& op) override;
@@ -156,6 +177,7 @@ class PTOCodegen : public CodegenBase {
   std::map<std::string, std::string> tensor_to_view_;
   std::map<const ir::MemRef*, std::string> memref_to_mlir_;
   std::map<std::string, const ir::MemRef*> var_to_memref_;
+  std::map<const ir::MemRef*, std::shared_ptr<const ir::TileType>> memref_to_tile_type_;
   std::set<int64_t> emitted_constants_;
   std::set<double> emitted_float_constants_;
   std::map<double, std::string> float_const_names_;
@@ -165,6 +187,7 @@ class PTOCodegen : public CodegenBase {
   // Current function context
   ir::FunctionPtr current_function_;
   std::string current_result_buf_;
+  std::shared_ptr<const ir::TileType> current_result_tile_type_;
 
   const backend::Backend* backend_;  ///< Backend instance for querying op info
 };

diff --git a/include/pypto/ir/scalar_expr.h b/include/pypto/ir/scalar_expr.h
@@ -388,7 +388,9 @@ inline BinaryOperands PromoteBinaryOperands(const ExprPtr& left, const ExprPtr&
   DataType left_dtype = GetScalarDtype(left);
   DataType right_dtype = GetScalarDtype(right);
   DataType promoted_dtype = PromoteSameCategoryDtype(left_dtype, right_dtype, op_name);
-  return {MaybeCast(left, promoted_dtype, span), MaybeCast(right, promoted_dtype, span), promoted_dtype};
+  ExprPtr promoted_left = MaybeCast(left, promoted_dtype, span);
+  ExprPtr promoted_right = MaybeCast(right, promoted_dtype, span);
+  return {std::move(promoted_left), std::move(promoted_right), promoted_dtype};
 }
 
 inline BinaryOperands PromoteIntBinaryOperands(const ExprPtr& left, const ExprPtr& right,
@@ -400,7 +402,9 @@ inline BinaryOperands PromoteIntBinaryOperands(const ExprPtr& left, const ExprPt
                     " and " + right_dtype.ToString());
   }
   DataType promoted_dtype = PromoteSameCategoryDtype(left_dtype, right_dtype, op_name);
-  return {MaybeCast(left, promoted_dtype, span), MaybeCast(right, promoted_dtype, span), promoted_dtype};
+  ExprPtr promoted_left = MaybeCast(left, promoted_dtype, span);
+  ExprPtr promoted_right = MaybeCast(right, promoted_dtype, span);
+  return {std::move(promoted_left), std::move(promoted_right), promoted_dtype};
 }
 
 // ========== Binary Operator Construction Functions ==========