From 54d63561b60351ee743188ab2c9f9ccc15ca2b53 Mon Sep 17 00:00:00 2001
From: Sylvain Noiry <sylvain.noiry@inria.fr>
Date: Thu, 5 Mar 2026 12:31:39 +0100
Subject: [PATCH] [Mlir] Skip default schedule for linalg.fill

Do not schedule the initialization operation linalg.fill, which is not
controllable by the user. The default schedule previously applied to it can
over-constrain the lowering passes that try to fuse or optimize it.
---
 src/xtc/backends/mlir/MlirCompilerPasses.py   |   3 +
 .../padding/test_gen_pad_dict_conv2d_mlir.py  | 240 +++++++-----------
 .../test_gen_pad_int_matmul_unpad_mlir.py     | 211 ++++++---------
 .../padding/test_gen_pad_tuple_conv2d_mlir.py | 238 +++++++----------
 .../test_gen_pad_tuple_matmul_unpad_mlir.py   | 211 ++++++---------
 .../padding/test_pad_constant_conv2d_mlir.py  | 232 ++++++-----------
 .../backends/padding/test_pad_conv2d_mlir.py  | 232 ++++++-----------
 .../padding/test_pad_matmul_unpad_mlir.py     | 215 ++++++----------
 .../test_pad_tuple_matmul_unpad_mlir.py       | 215 ++++++----------
 .../backends/test_conv2d_mini_mlir.py         | 145 ++++-------
 .../backends/test_conv2d_r181_mlir.py         |  67 ++---
 .../backends/test_conv2d_r181_mlir_sv.py      |  87 +++----
 tests/filecheck/backends/test_matmul_mlir.py  |  49 ++--
 .../backends/test_matmul_mlir_distributed.py  | 143 +++++------
 .../backends/test_matmul_mlir_parallel.py     | 119 ++++-----
 .../backends/test_matmul_ndiv_mlir.py         |  49 ++--
 .../backends/test_matmul_relu_mlir.py         |  53 ++--
 .../backends/test_matmul_scalar_mlir.py       | 131 +++++-----
 .../backends/test_mlir_pack_no_sdist.py       | 133 +++++-----
 .../backends/test_mlir_pack_sdist.py          | 139 +++++-----
 .../schedules/test_descript_slice_bigger.py   |  66 ++---
 .../schedules/test_descript_slice_smaller.py  |  66 ++---
 .../schedules/test_matmul_descript_mlir.py    |  49 ++--
 23 files changed, 1201 insertions(+), 1892 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index fe6120de..b221e2bf 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -189,6 +189,9 @@ def _generate_scheduling(self) -> OpResult:
         assert self._named_sequence is not None
         handle = None
         for schedule in self._nodes_schedules:
+            # Skip linalg.fill
+            if schedule.node_name[-1] == "0":  # identify with naming convention
+                continue
             self._create_sdist_meshes(schedule)
             handle = structured_match(
                 results_=transform.AnyOpType.get(),
diff --git a/tests/filecheck/backends/padding/test_gen_pad_dict_conv2d_mlir.py b/tests/filecheck/backends/padding/test_gen_pad_dict_conv2d_mlir.py
index 64c65795..423865fa 100644
--- a/tests/filecheck/backends/padding/test_gen_pad_dict_conv2d_mlir.py
+++ b/tests/filecheck/backends/padding/test_gen_pad_dict_conv2d_mlir.py
@@ -58,7 +58,7 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -67,39 +67,21 @@
 # CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./l" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./l" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./f" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_29 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_31 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_33 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -113,141 +95,93 @@
 # CHECK-NEXT:    func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x12x12x3xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloca : memref<1x12x12x3xf32>)
+# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c1_0 = arith.constant 1 : index
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:        %c12 = arith.constant 12 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c12 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c12_16 = arith.constant 12 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c12_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c3 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./l"}
-# CHECK-NEXT:          } {"./k"}
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:      %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:        %subview_5 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_13 to %c8 step %c1_14 {
-# CHECK-NEXT:          %subview_15 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_12[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_18 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_17 to %c8_18 step %c1_19 {
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_21 = memref.subview %subview_16[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:        %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_7 to %c8 step %c1_8 {
+# CHECK-NEXT:          %subview_9 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_6[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:          %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:          %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_9[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:              %subview_24 = memref.subview %subview_20[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_24 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
+# CHECK-NEXT:              %subview_18 = memref.subview %subview_14[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_19 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_18 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./l"}
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:      %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:      %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
+# CHECK-NEXT:        %subview_5 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:        %subview_7 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c4 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_16 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c4_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_8 to %c4 step %c1_9 {
+# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_5[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_11 = memref.subview %subview_6[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:          %subview_12 = memref.subview %subview_7[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:          %c4_14 = arith.constant 4 : index
+# CHECK-NEXT:          %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_13 to %c4_14 step %c1_15 {
+# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_10[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_11[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_12[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:            %c0_19 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
 # CHECK-NEXT:            %c1_20 = arith.constant 1 : index
 # CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c16 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_conv_0_} ins(%cst_4 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c1_9 step %c1_10 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:        %subview_13 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_14 to %c4 step %c1_15 {
-# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_11[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_17 = memref.subview %subview_12[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:          %subview_18 = memref.subview %subview_13[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_20 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_21 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_19 to %c4_20 step %c1_21 {
-# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
-# CHECK-NEXT:            %subview_22 = memref.subview %subview_16[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_23 = memref.subview %subview_17[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_18[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_25 to %c16 step %c1_26 {
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_22[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_28 = memref.subview %subview_23[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:              %subview_29 = memref.subview %subview_24[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:              %subview_21 = memref.subview %subview_16[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_22 = memref.subview %subview_17[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:              %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:              %c5 = arith.constant 5 : index
-# CHECK-NEXT:              %c1_31 = arith.constant 1 : index
-# CHECK-NEXT:              scf.for %arg7 = %c0_30 to %c5 step %c1_31 {
-# CHECK-NEXT:                %subview_32 = memref.subview %subview_27[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                %subview_33 = memref.subview %subview_28[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                %subview_34 = memref.subview %subview_29[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:                %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:                %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:                scf.for %arg8 = %c0_35 to %c5_36 step %c1_37 {
-# CHECK-NEXT:                  %subview_38 = memref.subview %subview_32[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_39 = memref.subview %subview_33[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_40 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:              %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:              scf.for %arg7 = %c0_24 to %c5 step %c1_25 {
+# CHECK-NEXT:                %subview_26 = memref.subview %subview_21[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                %subview_27 = memref.subview %subview_22[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                %subview_28 = memref.subview %subview_23[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:                %c5_30 = arith.constant 5 : index
+# CHECK-NEXT:                %c1_31 = arith.constant 1 : index
+# CHECK-NEXT:                scf.for %arg8 = %c0_29 to %c5_30 step %c1_31 {
+# CHECK-NEXT:                  %subview_32 = memref.subview %subview_26[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_33 = memref.subview %subview_27[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_34 = memref.subview %subview_28[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %c0_35 = arith.constant 0 : index
 # CHECK-NEXT:                  %c3 = arith.constant 3 : index
-# CHECK-NEXT:                  %c1_42 = arith.constant 1 : index
-# CHECK-NEXT:                  scf.for %arg9 = %c0_41 to %c3 step %c1_42 {
-# CHECK-NEXT:                    %subview_43 = memref.subview %subview_38[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_44 = memref.subview %subview_39[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_45 = memref.subview %subview_40[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_44 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                    ^bb0(%in: f32, %in_46: f32, %out: f32):
-# CHECK-NEXT:                      %2 = arith.mulf %in, %in_46 : f32
+# CHECK-NEXT:                  %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:                  scf.for %arg9 = %c0_35 to %c3 step %c1_36 {
+# CHECK-NEXT:                    %subview_37 = memref.subview %subview_32[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_38 = memref.subview %subview_33[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_39 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_38 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                    ^bb0(%in: f32, %in_40: f32, %out: f32):
+# CHECK-NEXT:                      %2 = arith.mulf %in, %in_40 : f32
 # CHECK-NEXT:                      %3 = arith.addf %out, %2 : f32
 # CHECK-NEXT:                      linalg.yield %3 : f32
 # CHECK-NEXT:                    }
@@ -270,7 +204,7 @@
 # CHECK-NEXT:    outputs:
 # CHECK-NEXT:    - %3 : 1x4x4x16xfloat32
 # CHECK-NEXT:    nodes:
-# CHECK-NEXT:   - %2: pad(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32] 
+# CHECK-NEXT:    - %2: pad(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32]
 # CHECK-NEXT:    - %3: conv2d(%2, %1, stride=(2, 2)) {name = 'conv'} : [1x12x12x3xfloat32, 5x5x3x16xfloat32] -> [1x4x4x16xfloat32]
 # CHECK-NEXT:  
 # CHECK-NEXT:  CODE: 0
diff --git a/tests/filecheck/backends/padding/test_gen_pad_int_matmul_unpad_mlir.py b/tests/filecheck/backends/padding/test_gen_pad_int_matmul_unpad_mlir.py
index 4a131f44..721e42ca 100644
--- a/tests/filecheck/backends/padding/test_gen_pad_int_matmul_unpad_mlir.py
+++ b/tests/filecheck/backends/padding/test_gen_pad_int_matmul_unpad_mlir.py
@@ -57,43 +57,28 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./j" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./j" : !transform.any_op
-# CHECK-NEXT:      %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./j" : !transform.any_op
-# CHECK-NEXT:      %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./k" : !transform.any_op
-# CHECK-NEXT:      %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
+# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./j" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -103,120 +88,84 @@
 # CHECK-NEXT:    func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<18x18xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c18 = arith.constant 18 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c18 step %c1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c18_25 = arith.constant 18 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c18_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_27 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<18x18xf32>)
 # CHECK-NEXT:      %subview = memref.subview %alloca[2, 2] [14, 14] [1, 1] : memref<18x18xf32> to memref<14x14xf32, strided<[18, 1], offset: 38>>
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c14 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c14 step %c1_1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c14 step %c1 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<18x18xf32>
-# CHECK-NEXT:      %cst_3 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:      %c18_5 = arith.constant 18 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_4 to %c18_5 step %c1_6 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_2[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c18_25 = arith.constant 18 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c18_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_B_pad_0_} ins(%cst_3 : f32) outs(%subview_27 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
+# CHECK-NEXT:      %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<18x18xf32>
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<18x18xf32>)
+# CHECK-NEXT:      %subview_2 = memref.subview %alloca_0[2, 2] [14, 14] [1, 1] : memref<18x18xf32> to memref<14x14xf32, strided<[18, 1], offset: 38>>
+# CHECK-NEXT:      %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_4 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_3 to %c14_4 step %c1_5 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview_2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_7 = memref.subview %alloca_2[2, 2] [14, 14] [1, 1] : memref<18x18xf32> to memref<14x14xf32, strided<[18, 1], offset: 38>>
+# CHECK-NEXT:      %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<18x18xf32>
+# CHECK-NEXT:      %cst_7 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%alloca_6 : memref<18x18xf32>)
 # CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_9 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c14_9 step %c1_10 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview_7[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %alloca_11 = memref.alloca() {alignment = 256 : i64} : memref<18x18xf32>
-# CHECK-NEXT:      %cst_12 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:      %c18_14 = arith.constant 18 : index
-# CHECK-NEXT:      %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_13 to %c18_14 step %c1_15 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_11[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c18_25 = arith.constant 18 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c18_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_12 : f32) outs(%subview_27 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:      %c18_17 = arith.constant 18 : index
-# CHECK-NEXT:      %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_16 to %c18_17 step %c1_18 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %alloca_2[0, 0] [18, 18] [1, 1] : memref<18x18xf32> to memref<18x18xf32, strided<[18, 1]>>
-# CHECK-NEXT:        %subview_25 = memref.subview %alloca_11[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %c0_26 = arith.constant 0 : index
-# CHECK-NEXT:        %c18_27 = arith.constant 18 : index
-# CHECK-NEXT:        %c1_28 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_26 to %c18_27 step %c1_28 {
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_23[0, 0] [1, 18] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x18xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          %subview_30 = memref.subview %subview_24[0, %arg4] [18, 1] [1, 1] : memref<18x18xf32, strided<[18, 1]>> to memref<18x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          %subview_31 = memref.subview %subview_25[0, %arg4] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          %c0_32 = arith.constant 0 : index
-# CHECK-NEXT:          %c18_33 = arith.constant 18 : index
-# CHECK-NEXT:          %c1_34 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_32 to %c18_33 step %c1_34 {
-# CHECK-NEXT:            %subview_35 = memref.subview %subview_29[0, %arg5] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:            %subview_36 = memref.subview %subview_30[%arg5, 0] [1, 1] [1, 1] : memref<18x1xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:            %subview_37 = memref.subview %subview_31[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_35, %subview_36 : memref<1x1xf32, strided<[18, 1], offset: ?>>, memref<1x1xf32, strided<[18, 1], offset: ?>>) outs(%subview_37 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
+# CHECK-NEXT:      %c18 = arith.constant 18 : index
+# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c18 step %c1_9 {
+# CHECK-NEXT:        %subview_14 = memref.subview %alloca[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %alloca_0[0, 0] [18, 18] [1, 1] : memref<18x18xf32> to memref<18x18xf32, strided<[18, 1]>>
+# CHECK-NEXT:        %subview_16 = memref.subview %alloca_6[%arg3, 0] [1, 18] [1, 1] : memref<18x18xf32> to memref<1x18xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:        %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:        %c18_18 = arith.constant 18 : index
+# CHECK-NEXT:        %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_17 to %c18_18 step %c1_19 {
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_14[0, 0] [1, 18] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x18xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          %subview_21 = memref.subview %subview_15[0, %arg4] [18, 1] [1, 1] : memref<18x18xf32, strided<[18, 1]>> to memref<18x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          %subview_22 = memref.subview %subview_16[0, %arg4] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:          %c18_24 = arith.constant 18 : index
+# CHECK-NEXT:          %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_23 to %c18_24 step %c1_25 {
+# CHECK-NEXT:            %subview_26 = memref.subview %subview_20[0, %arg5] [1, 1] [1, 1] : memref<1x18xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:            %subview_27 = memref.subview %subview_21[%arg5, 0] [1, 1] [1, 1] : memref<18x1xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:            %subview_28 = memref.subview %subview_22[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_26, %subview_27 : memref<1x1xf32, strided<[18, 1], offset: ?>>, memref<1x1xf32, strided<[18, 1], offset: ?>>) outs(%subview_28 : memref<1x1xf32, strided<[18, 1], offset: ?>>)
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_19 = memref.subview %alloca_11[2, 2] [14, 14] [1, 1] : memref<18x18xf32> to memref<14x14xf32, strided<[18, 1], offset: 38>>
-# CHECK-NEXT:      %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_21 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_20 to %c14_21 step %c1_22 {
-# CHECK-NEXT:        %subview_23 = memref.subview %subview_19[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_28 : memref<1x1xf32, strided<[18, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
+# CHECK-NEXT:      %subview_10 = memref.subview %alloca_6[2, 2] [14, 14] [1, 1] : memref<18x18xf32> to memref<14x14xf32, strided<[18, 1], offset: 38>>
+# CHECK-NEXT:      %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_12 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_11 to %c14_12 step %c1_13 {
+# CHECK-NEXT:        %subview_14 = memref.subview %subview_10[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[18, 1], offset: 38>> to memref<1x14xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[18, 1], offset: ?>> to memref<1x1xf32, strided<[18, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_19 : memref<1x1xf32, strided<[18, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
 # CHECK-NEXT:      return
diff --git a/tests/filecheck/backends/padding/test_gen_pad_tuple_conv2d_mlir.py b/tests/filecheck/backends/padding/test_gen_pad_tuple_conv2d_mlir.py
index 35fa57c5..8c622867 100644
--- a/tests/filecheck/backends/padding/test_gen_pad_tuple_conv2d_mlir.py
+++ b/tests/filecheck/backends/padding/test_gen_pad_tuple_conv2d_mlir.py
@@ -57,7 +57,7 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -66,39 +66,21 @@
 # CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./l" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./l" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./f" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_29 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_31 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_33 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -112,141 +94,93 @@
 # CHECK-NEXT:    func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x12x12x3xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloca : memref<1x12x12x3xf32>)
+# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c1_0 = arith.constant 1 : index
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:        %c12 = arith.constant 12 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c12 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c12_16 = arith.constant 12 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c12_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c3 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./l"}
-# CHECK-NEXT:          } {"./k"}
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:      %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:        %subview_5 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_13 to %c8 step %c1_14 {
-# CHECK-NEXT:          %subview_15 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_12[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_18 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_17 to %c8_18 step %c1_19 {
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_21 = memref.subview %subview_16[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:        %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_7 to %c8 step %c1_8 {
+# CHECK-NEXT:          %subview_9 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_6[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:          %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:          %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_9[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:              %subview_24 = memref.subview %subview_20[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_24 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
+# CHECK-NEXT:              %subview_18 = memref.subview %subview_14[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_19 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_18 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./l"}
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:      %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:      %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
+# CHECK-NEXT:        %subview_5 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:        %subview_7 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c4 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_16 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c4_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_8 to %c4 step %c1_9 {
+# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_5[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_11 = memref.subview %subview_6[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:          %subview_12 = memref.subview %subview_7[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:          %c4_14 = arith.constant 4 : index
+# CHECK-NEXT:          %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_13 to %c4_14 step %c1_15 {
+# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_10[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_11[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_12[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:            %c0_19 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
 # CHECK-NEXT:            %c1_20 = arith.constant 1 : index
 # CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c16 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_conv_0_} ins(%cst_4 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c1_9 step %c1_10 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:        %subview_13 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_14 to %c4 step %c1_15 {
-# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_11[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_17 = memref.subview %subview_12[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:          %subview_18 = memref.subview %subview_13[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_20 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_21 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_19 to %c4_20 step %c1_21 {
-# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
-# CHECK-NEXT:            %subview_22 = memref.subview %subview_16[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_23 = memref.subview %subview_17[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_18[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_25 to %c16 step %c1_26 {
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_22[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_28 = memref.subview %subview_23[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:              %subview_29 = memref.subview %subview_24[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:              %subview_21 = memref.subview %subview_16[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_22 = memref.subview %subview_17[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:              %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:              %c5 = arith.constant 5 : index
-# CHECK-NEXT:              %c1_31 = arith.constant 1 : index
-# CHECK-NEXT:              scf.for %arg7 = %c0_30 to %c5 step %c1_31 {
-# CHECK-NEXT:                %subview_32 = memref.subview %subview_27[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                %subview_33 = memref.subview %subview_28[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                %subview_34 = memref.subview %subview_29[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:                %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:                %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:                scf.for %arg8 = %c0_35 to %c5_36 step %c1_37 {
-# CHECK-NEXT:                  %subview_38 = memref.subview %subview_32[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_39 = memref.subview %subview_33[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_40 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:              %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:              scf.for %arg7 = %c0_24 to %c5 step %c1_25 {
+# CHECK-NEXT:                %subview_26 = memref.subview %subview_21[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                %subview_27 = memref.subview %subview_22[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                %subview_28 = memref.subview %subview_23[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:                %c5_30 = arith.constant 5 : index
+# CHECK-NEXT:                %c1_31 = arith.constant 1 : index
+# CHECK-NEXT:                scf.for %arg8 = %c0_29 to %c5_30 step %c1_31 {
+# CHECK-NEXT:                  %subview_32 = memref.subview %subview_26[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_33 = memref.subview %subview_27[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_34 = memref.subview %subview_28[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %c0_35 = arith.constant 0 : index
 # CHECK-NEXT:                  %c3 = arith.constant 3 : index
-# CHECK-NEXT:                  %c1_42 = arith.constant 1 : index
-# CHECK-NEXT:                  scf.for %arg9 = %c0_41 to %c3 step %c1_42 {
-# CHECK-NEXT:                    %subview_43 = memref.subview %subview_38[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_44 = memref.subview %subview_39[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_45 = memref.subview %subview_40[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_44 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                    ^bb0(%in: f32, %in_46: f32, %out: f32):
-# CHECK-NEXT:                      %2 = arith.mulf %in, %in_46 : f32
+# CHECK-NEXT:                  %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:                  scf.for %arg9 = %c0_35 to %c3 step %c1_36 {
+# CHECK-NEXT:                    %subview_37 = memref.subview %subview_32[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_38 = memref.subview %subview_33[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_39 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_38 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                    ^bb0(%in: f32, %in_40: f32, %out: f32):
+# CHECK-NEXT:                      %2 = arith.mulf %in, %in_40 : f32
 # CHECK-NEXT:                      %3 = arith.addf %out, %2 : f32
 # CHECK-NEXT:                      linalg.yield %3 : f32
 # CHECK-NEXT:                    }
diff --git a/tests/filecheck/backends/padding/test_gen_pad_tuple_matmul_unpad_mlir.py b/tests/filecheck/backends/padding/test_gen_pad_tuple_matmul_unpad_mlir.py
index 6110c923..d899239a 100644
--- a/tests/filecheck/backends/padding/test_gen_pad_tuple_matmul_unpad_mlir.py
+++ b/tests/filecheck/backends/padding/test_gen_pad_tuple_matmul_unpad_mlir.py
@@ -57,43 +57,28 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./i" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./j" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./j" : !transform.any_op
-# CHECK-NEXT:      %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./j" : !transform.any_op
-# CHECK-NEXT:      %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./k" : !transform.any_op
-# CHECK-NEXT:      %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
+# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./j" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -103,120 +88,84 @@
 # CHECK-NEXT:    func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c16 step %c1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
 # CHECK-NEXT:      %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c14 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c14 step %c1_1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c14 step %c1 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_3 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_5 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_4 to %c16_5 step %c1_6 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_2[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_B_pad_0_} ins(%cst_3 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
+# CHECK-NEXT:      %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_4 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_3 to %c14_4 step %c1_5 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview_2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_7 = memref.subview %alloca_2[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_7 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%alloca_6 : memref<16x16xf32>)
 # CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_9 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c14_9 step %c1_10 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview_7[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %alloca_11 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_12 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_13 to %c16_14 step %c1_15 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_12 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %alloca_2[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
-# CHECK-NEXT:        %subview_25 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_26 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_27 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_28 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_26 to %c16_27 step %c1_28 {
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_23[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_30 = memref.subview %subview_24[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_31 = memref.subview %subview_25[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_32 = arith.constant 0 : index
-# CHECK-NEXT:          %c16_33 = arith.constant 16 : index
-# CHECK-NEXT:          %c1_34 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_32 to %c16_33 step %c1_34 {
-# CHECK-NEXT:            %subview_35 = memref.subview %subview_29[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_36 = memref.subview %subview_30[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_37 = memref.subview %subview_31[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_35, %subview_36 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_37 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c16 = arith.constant 16 : index
+# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c16 step %c1_9 {
+# CHECK-NEXT:        %subview_14 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
+# CHECK-NEXT:        %subview_16 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:        %c16_18 = arith.constant 16 : index
+# CHECK-NEXT:        %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_17 to %c16_18 step %c1_19 {
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_14[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_21 = memref.subview %subview_15[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_22 = memref.subview %subview_16[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:          %c16_24 = arith.constant 16 : index
+# CHECK-NEXT:          %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_23 to %c16_24 step %c1_25 {
+# CHECK-NEXT:            %subview_26 = memref.subview %subview_20[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_27 = memref.subview %subview_21[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_28 = memref.subview %subview_22[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_26, %subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_19 = memref.subview %alloca_11[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_21 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_20 to %c14_21 step %c1_22 {
-# CHECK-NEXT:        %subview_23 = memref.subview %subview_19[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
+# CHECK-NEXT:      %subview_10 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_12 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_11 to %c14_12 step %c1_13 {
+# CHECK-NEXT:        %subview_14 = memref.subview %subview_10[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
 # CHECK-NEXT:      return
diff --git a/tests/filecheck/backends/padding/test_pad_constant_conv2d_mlir.py b/tests/filecheck/backends/padding/test_pad_constant_conv2d_mlir.py
index 58c5b977..4f35109d 100644
--- a/tests/filecheck/backends/padding/test_pad_constant_conv2d_mlir.py
+++ b/tests/filecheck/backends/padding/test_pad_constant_conv2d_mlir.py
@@ -57,7 +57,7 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -66,39 +66,21 @@
 # CHECK-NEXT:      transform.annotate %loops_3 "./w" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./c" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./c" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./f" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_29 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_31 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_33 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -112,141 +94,93 @@
 # CHECK-NEXT:    func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x12x12x3xf32>
 # CHECK-NEXT:      %cst = arith.constant 3.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloca : memref<1x12x12x3xf32>)
+# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c1_0 = arith.constant 1 : index
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:        %c12 = arith.constant 12 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c12 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c12_16 = arith.constant 12 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c12_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c3 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./c"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:      %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:        %subview_5 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_13 to %c8 step %c1_14 {
-# CHECK-NEXT:          %subview_15 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_12[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_18 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_17 to %c8_18 step %c1_19 {
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_21 = memref.subview %subview_16[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:        %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_7 to %c8 step %c1_8 {
+# CHECK-NEXT:          %subview_9 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_6[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:          %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:          %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_9[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:              %subview_24 = memref.subview %subview_20[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_24 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
+# CHECK-NEXT:              %subview_18 = memref.subview %subview_14[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_19 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_18 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./c"}
 # CHECK-NEXT:          } {"./w"}
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:      %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:      %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
+# CHECK-NEXT:        %subview_5 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:        %subview_7 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c4 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_16 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c4_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_8 to %c4 step %c1_9 {
+# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_5[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_11 = memref.subview %subview_6[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:          %subview_12 = memref.subview %subview_7[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:          %c4_14 = arith.constant 4 : index
+# CHECK-NEXT:          %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_13 to %c4_14 step %c1_15 {
+# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_10[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_11[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_12[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:            %c0_19 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
 # CHECK-NEXT:            %c1_20 = arith.constant 1 : index
 # CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c16 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_conv_0_} ins(%cst_4 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c1_9 step %c1_10 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:        %subview_13 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_14 to %c4 step %c1_15 {
-# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_11[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_17 = memref.subview %subview_12[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:          %subview_18 = memref.subview %subview_13[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_20 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_21 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_19 to %c4_20 step %c1_21 {
-# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
-# CHECK-NEXT:            %subview_22 = memref.subview %subview_16[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_23 = memref.subview %subview_17[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_18[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_25 to %c16 step %c1_26 {
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_22[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_28 = memref.subview %subview_23[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:              %subview_29 = memref.subview %subview_24[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:              %subview_21 = memref.subview %subview_16[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_22 = memref.subview %subview_17[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:              %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:              %c5 = arith.constant 5 : index
-# CHECK-NEXT:              %c1_31 = arith.constant 1 : index
-# CHECK-NEXT:              scf.for %arg7 = %c0_30 to %c5 step %c1_31 {
-# CHECK-NEXT:                %subview_32 = memref.subview %subview_27[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                %subview_33 = memref.subview %subview_28[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                %subview_34 = memref.subview %subview_29[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:                %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:                %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:                scf.for %arg8 = %c0_35 to %c5_36 step %c1_37 {
-# CHECK-NEXT:                  %subview_38 = memref.subview %subview_32[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_39 = memref.subview %subview_33[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_40 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:              %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:              scf.for %arg7 = %c0_24 to %c5 step %c1_25 {
+# CHECK-NEXT:                %subview_26 = memref.subview %subview_21[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                %subview_27 = memref.subview %subview_22[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                %subview_28 = memref.subview %subview_23[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:                %c5_30 = arith.constant 5 : index
+# CHECK-NEXT:                %c1_31 = arith.constant 1 : index
+# CHECK-NEXT:                scf.for %arg8 = %c0_29 to %c5_30 step %c1_31 {
+# CHECK-NEXT:                  %subview_32 = memref.subview %subview_26[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_33 = memref.subview %subview_27[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_34 = memref.subview %subview_28[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %c0_35 = arith.constant 0 : index
 # CHECK-NEXT:                  %c3 = arith.constant 3 : index
-# CHECK-NEXT:                  %c1_42 = arith.constant 1 : index
-# CHECK-NEXT:                  scf.for %arg9 = %c0_41 to %c3 step %c1_42 {
-# CHECK-NEXT:                    %subview_43 = memref.subview %subview_38[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_44 = memref.subview %subview_39[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_45 = memref.subview %subview_40[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_44 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                    ^bb0(%in: f32, %in_46: f32, %out: f32):
-# CHECK-NEXT:                      %2 = arith.mulf %in, %in_46 : f32
+# CHECK-NEXT:                  %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:                  scf.for %arg9 = %c0_35 to %c3 step %c1_36 {
+# CHECK-NEXT:                    %subview_37 = memref.subview %subview_32[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_38 = memref.subview %subview_33[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_39 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_38 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                    ^bb0(%in: f32, %in_40: f32, %out: f32):
+# CHECK-NEXT:                      %2 = arith.mulf %in, %in_40 : f32
 # CHECK-NEXT:                      %3 = arith.addf %out, %2 : f32
 # CHECK-NEXT:                      linalg.yield %3 : f32
 # CHECK-NEXT:                    }
diff --git a/tests/filecheck/backends/padding/test_pad_conv2d_mlir.py b/tests/filecheck/backends/padding/test_pad_conv2d_mlir.py
index 7b97ee76..8c943ce6 100644
--- a/tests/filecheck/backends/padding/test_pad_conv2d_mlir.py
+++ b/tests/filecheck/backends/padding/test_pad_conv2d_mlir.py
@@ -57,7 +57,7 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -66,39 +66,21 @@
 # CHECK-NEXT:      transform.annotate %loops_3 "./w" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./c" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./c" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./f" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_29 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_31 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_33 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -112,141 +94,93 @@
 # CHECK-NEXT:    func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x12x12x3xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloca : memref<1x12x12x3xf32>)
+# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c1_0 = arith.constant 1 : index
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:        %c12 = arith.constant 12 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c12 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c12_16 = arith.constant 12 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c12_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c3 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./c"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %subview = memref.subview %alloca[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:      %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:        %subview_5 = memref.subview %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %subview[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_13 to %c8 step %c1_14 {
-# CHECK-NEXT:          %subview_15 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_12[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_18 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_17 to %c8_18 step %c1_19 {
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_21 = memref.subview %subview_16[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:        %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_7 to %c8 step %c1_8 {
+# CHECK-NEXT:          %subview_9 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_6[0, %arg4, 0, 0] [1, 1, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:          %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:          %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_9[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x8x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:            %c3 = arith.constant 3 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:              %subview_24 = memref.subview %subview_20[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_24 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
+# CHECK-NEXT:              %subview_18 = memref.subview %subview_14[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_19 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              linalg.copy {__xtc_id_pad_} ins(%subview_18 : memref<1x1x1x1xf32, strided<[192, 24, 3, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./c"}
 # CHECK-NEXT:          } {"./w"}
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
-# CHECK-NEXT:        %subview_11 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:      %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:      %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
+# CHECK-NEXT:        %subview_5 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:        %subview_6 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:        %subview_7 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_12 to %c4 step %c1_13 {
-# CHECK-NEXT:          %subview_14 = memref.subview %subview_11[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_16 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_15 to %c4_16 step %c1_17 {
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_14[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_8 to %c4 step %c1_9 {
+# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
+# CHECK-NEXT:          %subview_10 = memref.subview %subview_5[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_11 = memref.subview %subview_6[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:          %subview_12 = memref.subview %subview_7[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:          %c4_14 = arith.constant 4 : index
+# CHECK-NEXT:          %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_13 to %c4_14 step %c1_15 {
+# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_10[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_11[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_12[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:            %c0_19 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
 # CHECK-NEXT:            %c1_20 = arith.constant 1 : index
 # CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c16 step %c1_20 {
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_conv_0_} ins(%cst_4 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c1_9 step %c1_10 {
-# CHECK-NEXT:        %subview_11 = memref.subview %alloca[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_12 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:        %subview_13 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:        %c4 = arith.constant 4 : index
-# CHECK-NEXT:        %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_14 to %c4 step %c1_15 {
-# CHECK-NEXT:          %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_16 = memref.subview %subview_11[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_17 = memref.subview %subview_12[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:          %subview_18 = memref.subview %subview_13[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:          %c4_20 = arith.constant 4 : index
-# CHECK-NEXT:          %c1_21 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_19 to %c4_20 step %c1_21 {
-# CHECK-NEXT:            %1 = affine.apply #map(%arg5)
-# CHECK-NEXT:            %subview_22 = memref.subview %subview_16[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_23 = memref.subview %subview_17[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_18[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_25 to %c16 step %c1_26 {
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_22[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_28 = memref.subview %subview_23[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:              %subview_29 = memref.subview %subview_24[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:              %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:              %subview_21 = memref.subview %subview_16[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_22 = memref.subview %subview_17[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_18[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:              %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:              %c5 = arith.constant 5 : index
-# CHECK-NEXT:              %c1_31 = arith.constant 1 : index
-# CHECK-NEXT:              scf.for %arg7 = %c0_30 to %c5 step %c1_31 {
-# CHECK-NEXT:                %subview_32 = memref.subview %subview_27[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                %subview_33 = memref.subview %subview_28[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                %subview_34 = memref.subview %subview_29[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:                %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:                %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:                scf.for %arg8 = %c0_35 to %c5_36 step %c1_37 {
-# CHECK-NEXT:                  %subview_38 = memref.subview %subview_32[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_39 = memref.subview %subview_33[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_40 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:              %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:              scf.for %arg7 = %c0_24 to %c5 step %c1_25 {
+# CHECK-NEXT:                %subview_26 = memref.subview %subview_21[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                %subview_27 = memref.subview %subview_22[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                %subview_28 = memref.subview %subview_23[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:                %c5_30 = arith.constant 5 : index
+# CHECK-NEXT:                %c1_31 = arith.constant 1 : index
+# CHECK-NEXT:                scf.for %arg8 = %c0_29 to %c5_30 step %c1_31 {
+# CHECK-NEXT:                  %subview_32 = memref.subview %subview_26[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_33 = memref.subview %subview_27[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_34 = memref.subview %subview_28[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %c0_35 = arith.constant 0 : index
 # CHECK-NEXT:                  %c3 = arith.constant 3 : index
-# CHECK-NEXT:                  %c1_42 = arith.constant 1 : index
-# CHECK-NEXT:                  scf.for %arg9 = %c0_41 to %c3 step %c1_42 {
-# CHECK-NEXT:                    %subview_43 = memref.subview %subview_38[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_44 = memref.subview %subview_39[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_45 = memref.subview %subview_40[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_44 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                    ^bb0(%in: f32, %in_46: f32, %out: f32):
-# CHECK-NEXT:                      %2 = arith.mulf %in, %in_46 : f32
+# CHECK-NEXT:                  %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:                  scf.for %arg9 = %c0_35 to %c3 step %c1_36 {
+# CHECK-NEXT:                    %subview_37 = memref.subview %subview_32[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_38 = memref.subview %subview_33[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_39 = memref.subview %subview_34[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_38 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                    ^bb0(%in: f32, %in_40: f32, %out: f32):
+# CHECK-NEXT:                      %2 = arith.mulf %in, %in_40 : f32
 # CHECK-NEXT:                      %3 = arith.addf %out, %2 : f32
 # CHECK-NEXT:                      linalg.yield %3 : f32
 # CHECK-NEXT:                    }
diff --git a/tests/filecheck/backends/padding/test_pad_matmul_unpad_mlir.py b/tests/filecheck/backends/padding/test_pad_matmul_unpad_mlir.py
index e9dace84..19057f8b 100644
--- a/tests/filecheck/backends/padding/test_pad_matmul_unpad_mlir.py
+++ b/tests/filecheck/backends/padding/test_pad_matmul_unpad_mlir.py
@@ -57,43 +57,28 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./h" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./h" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./h" : !transform.any_op
-# CHECK-NEXT:      %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./j" : !transform.any_op
-# CHECK-NEXT:      %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./k" : !transform.any_op
-# CHECK-NEXT:      %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
+# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./j" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -103,120 +88,84 @@
 # CHECK-NEXT:    func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c16 step %c1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
 # CHECK-NEXT:      %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c14 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c14 step %c1_1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c14 step %c1 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_3 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_5 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_4 to %c16_5 step %c1_6 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_2[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_B_pad_0_} ins(%cst_3 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
+# CHECK-NEXT:      %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_4 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_3 to %c14_4 step %c1_5 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview_2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %subview_7 = memref.subview %alloca_2[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_7 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%alloca_6 : memref<16x16xf32>)
 # CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_9 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c14_9 step %c1_10 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview_7[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %alloca_11 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_12 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_13 to %c16_14 step %c1_15 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_12 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %alloca_2[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
-# CHECK-NEXT:        %subview_25 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_26 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_27 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_28 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_26 to %c16_27 step %c1_28 {
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_23[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_30 = memref.subview %subview_24[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_31 = memref.subview %subview_25[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_32 = arith.constant 0 : index
-# CHECK-NEXT:          %c16_33 = arith.constant 16 : index
-# CHECK-NEXT:          %c1_34 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_32 to %c16_33 step %c1_34 {
-# CHECK-NEXT:            %subview_35 = memref.subview %subview_29[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_36 = memref.subview %subview_30[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_37 = memref.subview %subview_31[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_35, %subview_36 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_37 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c16 = arith.constant 16 : index
+# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c16 step %c1_9 {
+# CHECK-NEXT:        %subview_14 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
+# CHECK-NEXT:        %subview_16 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:        %c16_18 = arith.constant 16 : index
+# CHECK-NEXT:        %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_17 to %c16_18 step %c1_19 {
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_14[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_21 = memref.subview %subview_15[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_22 = memref.subview %subview_16[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:          %c16_24 = arith.constant 16 : index
+# CHECK-NEXT:          %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_23 to %c16_24 step %c1_25 {
+# CHECK-NEXT:            %subview_26 = memref.subview %subview_20[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_27 = memref.subview %subview_21[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_28 = memref.subview %subview_22[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_26, %subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_19 = memref.subview %alloca_11[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_21 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_20 to %c14_21 step %c1_22 {
-# CHECK-NEXT:        %subview_23 = memref.subview %subview_19[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
+# CHECK-NEXT:      %subview_10 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_12 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_11 to %c14_12 step %c1_13 {
+# CHECK-NEXT:        %subview_14 = memref.subview %subview_10[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
 # CHECK-NEXT:      return
diff --git a/tests/filecheck/backends/padding/test_pad_tuple_matmul_unpad_mlir.py b/tests/filecheck/backends/padding/test_pad_tuple_matmul_unpad_mlir.py
index fdbcdbed..f2a1d466 100644
--- a/tests/filecheck/backends/padding/test_pad_tuple_matmul_unpad_mlir.py
+++ b/tests/filecheck/backends/padding/test_pad_tuple_matmul_unpad_mlir.py
@@ -57,43 +57,28 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./h" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./b" : !transform.any_op
 # CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./h" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./h" : !transform.any_op
-# CHECK-NEXT:      %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./j" : !transform.any_op
-# CHECK-NEXT:      %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_23 "./k" : !transform.any_op
-# CHECK-NEXT:      %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_25 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./k" : !transform.any_op
+# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_15 "./j" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -103,120 +88,84 @@
 # CHECK-NEXT:    func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c16 step %c1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
 # CHECK-NEXT:      %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c14 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c14 step %c1_1 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c14 step %c1 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg0[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_A_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_3 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_5 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_4 to %c16_5 step %c1_6 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_2[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_B_pad_0_} ins(%cst_3 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
+# CHECK-NEXT:      %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_4 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_3 to %c14_4 step %c1_5 {
+# CHECK-NEXT:        %subview_14 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %subview_2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_19 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./h"}
 # CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %subview_7 = memref.subview %alloca_2[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:      %cst_7 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%alloca_6 : memref<16x16xf32>)
 # CHECK-NEXT:      %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_9 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c14_9 step %c1_10 {
-# CHECK-NEXT:        %subview_23 = memref.subview %arg1[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %subview_7[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_B_pad_} ins(%subview_28 : memref<1x1xf32, strided<[14, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %alloca_11 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:      %cst_12 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_13 to %c16_14 step %c1_15 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:          %subview_27 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_12 : f32) outs(%subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:      %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:      %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:        %subview_23 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %alloca_2[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
-# CHECK-NEXT:        %subview_25 = memref.subview %alloca_11[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_26 = arith.constant 0 : index
-# CHECK-NEXT:        %c16_27 = arith.constant 16 : index
-# CHECK-NEXT:        %c1_28 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_26 to %c16_27 step %c1_28 {
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_23[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_30 = memref.subview %subview_24[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_31 = memref.subview %subview_25[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_32 = arith.constant 0 : index
-# CHECK-NEXT:          %c16_33 = arith.constant 16 : index
-# CHECK-NEXT:          %c1_34 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_32 to %c16_33 step %c1_34 {
-# CHECK-NEXT:            %subview_35 = memref.subview %subview_29[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_36 = memref.subview %subview_30[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            %subview_37 = memref.subview %subview_31[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_35, %subview_36 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_37 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:      %c16 = arith.constant 16 : index
+# CHECK-NEXT:      %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_8 to %c16 step %c1_9 {
+# CHECK-NEXT:        %subview_14 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
+# CHECK-NEXT:        %subview_16 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:        %c16_18 = arith.constant 16 : index
+# CHECK-NEXT:        %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_17 to %c16_18 step %c1_19 {
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_14[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_21 = memref.subview %subview_15[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_22 = memref.subview %subview_16[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:          %c16_24 = arith.constant 16 : index
+# CHECK-NEXT:          %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_23 to %c16_24 step %c1_25 {
+# CHECK-NEXT:            %subview_26 = memref.subview %subview_20[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_27 = memref.subview %subview_21[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            %subview_28 = memref.subview %subview_22[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:            linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_26, %subview_27 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:          } {"./k"}
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %subview_19 = memref.subview %alloca_11[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:      %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:      %c14_21 = arith.constant 14 : index
-# CHECK-NEXT:      %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_20 to %c14_21 step %c1_22 {
-# CHECK-NEXT:        %subview_23 = memref.subview %subview_19[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:        %subview_24 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:        %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:        %c14_26 = arith.constant 14 : index
-# CHECK-NEXT:        %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_25 to %c14_26 step %c1_27 {
-# CHECK-NEXT:          %subview_28 = memref.subview %subview_23[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:          %subview_29 = memref.subview %subview_24[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
-# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
+# CHECK-NEXT:      %subview_10 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:      %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:      %c14_12 = arith.constant 14 : index
+# CHECK-NEXT:      %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0_11 to %c14_12 step %c1_13 {
+# CHECK-NEXT:        %subview_14 = memref.subview %subview_10[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32, strided<[16, 1]>> to memref<1x14xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:        %subview_15 = memref.subview %arg2[%arg3, 0] [1, 14] [1, 1] : memref<14x14xf32> to memref<1x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:        %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:        %c14_17 = arith.constant 14 : index
+# CHECK-NEXT:        %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_16 to %c14_17 step %c1_18 {
+# CHECK-NEXT:          %subview_19 = memref.subview %subview_14[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:          %subview_20 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x14xf32, strided<[14, 1], offset: ?>> to memref<1x1xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:          linalg.copy {__xtc_id_C_} ins(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_20 : memref<1x1xf32, strided<[14, 1], offset: ?>>)
 # CHECK-NEXT:        } {"./j"}
 # CHECK-NEXT:      } {"./i"}
 # CHECK-NEXT:      return
diff --git a/tests/filecheck/backends/test_conv2d_mini_mlir.py b/tests/filecheck/backends/test_conv2d_mini_mlir.py
index 08d35c51..4ede3d0d 100644
--- a/tests/filecheck/backends/test_conv2d_mini_mlir.py
+++ b/tests/filecheck/backends/test_conv2d_mini_mlir.py
@@ -51,30 +51,21 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./f" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./c" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -86,83 +77,59 @@
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:    func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>)
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c1_0 = arith.constant 1 : index
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_5 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_4 to %c8 step %c1_5 {
-# CHECK-NEXT:          %subview_6 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_7 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_8 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_7 to %c8_8 step %c1_9 {
-# CHECK-NEXT:            %subview_10 = memref.subview %subview_6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_12 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_11 to %c16 step %c1_12 {
-# CHECK-NEXT:              %subview_13 = memref.subview %subview_10[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
-# CHECK-NEXT:      %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:      %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:      %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:        %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:        %subview_5 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:        %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:        %subview_1 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:        %subview_2 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:        %c0_3 = arith.constant 0 : index
 # CHECK-NEXT:        %c8 = arith.constant 8 : index
-# CHECK-NEXT:        %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_6 to %c8 step %c1_7 {
-# CHECK-NEXT:          %subview_8 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:          %subview_9 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:          %subview_10 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:          %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:          %c8_12 = arith.constant 8 : index
-# CHECK-NEXT:          %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
-# CHECK-NEXT:            %subview_14 = memref.subview %subview_8[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:            %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:            %subview_16 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:            %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:        %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_3 to %c8 step %c1_4 {
+# CHECK-NEXT:          %subview_5 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:          %subview_6 = memref.subview %subview_1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:          %subview_7 = memref.subview %subview_2[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:          %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:          %c8_9 = arith.constant 8 : index
+# CHECK-NEXT:          %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_8 to %c8_9 step %c1_10 {
+# CHECK-NEXT:            %subview_11 = memref.subview %subview_5[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:            %subview_12 = memref.subview %subview_6[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:            %subview_13 = memref.subview %subview_7[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:            %c0_14 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_17 to %c16 step %c1_18 {
-# CHECK-NEXT:              %subview_19 = memref.subview %subview_14[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:              %subview_20 = memref.subview %subview_15[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:              %subview_21 = memref.subview %subview_16[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:              %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:            %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_14 to %c16 step %c1_15 {
+# CHECK-NEXT:              %subview_16 = memref.subview %subview_11[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:              %subview_17 = memref.subview %subview_12[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:              %subview_18 = memref.subview %subview_13[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:              %c0_19 = arith.constant 0 : index
 # CHECK-NEXT:              %c3 = arith.constant 3 : index
-# CHECK-NEXT:              %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:              scf.for %arg7 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:                %subview_24 = memref.subview %subview_19[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                %subview_25 = memref.subview %subview_20[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                %subview_26 = memref.subview %subview_21[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                %c0_27 = arith.constant 0 : index
-# CHECK-NEXT:                %c3_28 = arith.constant 3 : index
-# CHECK-NEXT:                %c1_29 = arith.constant 1 : index
-# CHECK-NEXT:                scf.for %arg8 = %c0_27 to %c3_28 step %c1_29 {
-# CHECK-NEXT:                  %subview_30 = memref.subview %subview_24[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_31 = memref.subview %subview_25[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %subview_32 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                  %c0_33 = arith.constant 0 : index
-# CHECK-NEXT:                  %c3_34 = arith.constant 3 : index
-# CHECK-NEXT:                  %c1_35 = arith.constant 1 : index
-# CHECK-NEXT:                  scf.for %arg9 = %c0_33 to %c3_34 step %c1_35 {
-# CHECK-NEXT:                    %subview_36 = memref.subview %subview_30[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_37 = memref.subview %subview_31[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                    %subview_38 = memref.subview %subview_32[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_36, %subview_37 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                    ^bb0(%in: f32, %in_39: f32, %out: f32):
-# CHECK-NEXT:                      %0 = arith.mulf %in, %in_39 : f32
+# CHECK-NEXT:              %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:              scf.for %arg7 = %c0_19 to %c3 step %c1_20 {
+# CHECK-NEXT:                %subview_21 = memref.subview %subview_16[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                %subview_22 = memref.subview %subview_17[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                %subview_23 = memref.subview %subview_18[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:                %c3_25 = arith.constant 3 : index
+# CHECK-NEXT:                %c1_26 = arith.constant 1 : index
+# CHECK-NEXT:                scf.for %arg8 = %c0_24 to %c3_25 step %c1_26 {
+# CHECK-NEXT:                  %subview_27 = memref.subview %subview_21[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_28 = memref.subview %subview_22[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %subview_29 = memref.subview %subview_23[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                  %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:                  %c3_31 = arith.constant 3 : index
+# CHECK-NEXT:                  %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:                  scf.for %arg9 = %c0_30 to %c3_31 step %c1_32 {
+# CHECK-NEXT:                    %subview_33 = memref.subview %subview_27[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_34 = memref.subview %subview_28[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                    %subview_35 = memref.subview %subview_29[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                    linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_33, %subview_34 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_35 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                    ^bb0(%in: f32, %in_36: f32, %out: f32):
+# CHECK-NEXT:                      %0 = arith.mulf %in, %in_36 : f32
 # CHECK-NEXT:                      %1 = arith.addf %out, %0 : f32
 # CHECK-NEXT:                      linalg.yield %1 : f32
 # CHECK-NEXT:                    }
diff --git a/tests/filecheck/backends/test_conv2d_r181_mlir.py b/tests/filecheck/backends/test_conv2d_r181_mlir.py
index ddfde1cd..b0116de6 100644
--- a/tests/filecheck/backends/test_conv2d_r181_mlir.py
+++ b/tests/filecheck/backends/test_conv2d_r181_mlir.py
@@ -60,41 +60,32 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./f" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./w1" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_20) : (!transform.any_op) -> ()
-# CHECK-NEXT:      transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./c" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./w1" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_12) : (!transform.any_op) -> ()
+# CHECK-NEXT:      transform.loop.unroll %loops_13 {factor = 4 : i64} : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_11 {factor = 3 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -114,25 +105,13 @@
 # CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c7 = arith.constant 7 : index
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c64 = arith.constant 64 : index
+# CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c112 = arith.constant 112 : index
-# CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c112 step %c1 {
-# CHECK-NEXT:          %subview_0 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:          scf.for %arg5 = %c0 to %c112 step %c1 {
-# CHECK-NEXT:            %subview_1 = memref.subview %subview_0[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:            scf.for %arg6 = %c0 to %c64 step %c1 {
-# CHECK-NEXT:              %subview_2 = memref.subview %subview_1[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:              linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_2 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>)
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x112x112x64xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
 # CHECK-NEXT:        %subview_0 = memref.subview %arg1[0, 0, 0, 0] [7, 7, 3, 64] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x64xf32, strided<[1344, 192, 64, 1]>>
diff --git a/tests/filecheck/backends/test_conv2d_r181_mlir_sv.py b/tests/filecheck/backends/test_conv2d_r181_mlir_sv.py
index 8f2df57a..7a75971f 100644
--- a/tests/filecheck/backends/test_conv2d_r181_mlir_sv.py
+++ b/tests/filecheck/backends/test_conv2d_r181_mlir_sv.py
@@ -1,5 +1,6 @@
 # RUN: python %s 2>&1 | filecheck %s
 # UNSUPPORTED: mlir-target=nvgpu
+# UNSUPPORTED: mlir-target=c
 
 import xtc.graphs.xtc.op as O
 from xtc.backends.mlir.MlirGraphBackend import MlirGraphBackend as Backend
@@ -67,44 +68,35 @@
 # CHECK-NEXT:      transform.yield %0 : !transform.any_op
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_1 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_3 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %loops_5 "./f" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./b" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./h" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./w" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "./f" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "./r" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "./s" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_19 "./c" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_21 "./w1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %3 = transform.apply_registered_pass "convert-linalg-to-affine-loops" to %2 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %4 = transform.include @_super_vectorize failures(suppress) (%3) : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.yield 
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./r" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "./s" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "./c" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "./w1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_13 {factor = 4 : i64} : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_11 {factor = 3 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %2 = transform.apply_registered_pass "convert-linalg-to-affine-loops" to %1 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %3 = transform.include @_super_vectorize failures(suppress) (%2) : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.yield
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
 
 # CHECK: MLIR Error: NYI: non-trivial layout map
 
-# CHECK:  // -----// IR Dump After transform //----- //
+# CHECK:       // -----// IR Dump After transform //----- //
 # CHECK-NEXT:  #map = affine_map<(d0) -> (d0 * 2)>
 # CHECK-NEXT:  #map1 = affine_map<(d0, d1) -> (d0 * 2 + d1)>
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
@@ -114,33 +106,22 @@
 # CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c7 = arith.constant 7 : index
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c64 = arith.constant 64 : index
+# CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c112 = arith.constant 112 : index
-# CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c112 step %c1 {
-# CHECK-NEXT:          %subview_0 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:          scf.for %arg5 = %c0 to %c112 step %c1 {
-# CHECK-NEXT:            %subview_1 = memref.subview %subview_0[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:            scf.for %arg6 = %c0 to %c64 step %c1 {
-# CHECK-NEXT:              %subview_2 = memref.subview %subview_1[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:              affine.for %arg7 = 0 to 1 {
-# CHECK-NEXT:                affine.for %arg8 = 0 to 1 {
-# CHECK-NEXT:                  affine.for %arg9 = 0 to 1 {
-# CHECK-NEXT:                    affine.for %arg10 = 0 to 1 {
-# CHECK-NEXT:                      affine.store %cst, %subview_2[%arg7, %arg8, %arg9, %arg10] : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                    }
-# CHECK-NEXT:                  }
-# CHECK-NEXT:                }
-# CHECK-NEXT:              }
-# CHECK-NEXT:            } {"./f"}
-# CHECK-NEXT:          } {"./w"}
-# CHECK-NEXT:        } {"./h"}
-# CHECK-NEXT:      } {"./b"}
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      affine.for %arg3 = 0 to 1 {
+# CHECK-NEXT:        affine.for %arg4 = 0 to 112 {
+# CHECK-NEXT:          affine.for %arg5 = 0 to 112 {
+# CHECK-NEXT:            affine.for %arg6 = 0 to 64 step 16 {
+# CHECK-NEXT:              %cst_0 = arith.constant dense<0.000000e+00> : vector<16xf32>
+# CHECK-NEXT:              vector.transfer_write %cst_0, %arg2[%arg3, %arg4, %arg5, %arg6] : vector<16xf32>, memref<1x112x112x64xf32>
+# CHECK-NEXT:            }
+# CHECK-NEXT:          }
+# CHECK-NEXT:        }
+# CHECK-NEXT:      }
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c1 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
 # CHECK-NEXT:        %subview_0 = memref.subview %arg1[0, 0, 0, 0] [7, 7, 3, 64] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x64xf32, strided<[1344, 192, 64, 1]>>
diff --git a/tests/filecheck/backends/test_matmul_mlir.py b/tests/filecheck/backends/test_matmul_mlir.py
index 186bb45b..5323fa72 100644
--- a/tests/filecheck/backends/test_matmul_mlir.py
+++ b/tests/filecheck/backends/test_matmul_mlir.py
@@ -47,28 +47,23 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_4) : (!transform.any_op) -> ()
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -82,20 +77,14 @@
 # CHECK-NEXT:      %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:      %0 = ub.poison : f32
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c2 = arith.constant 2 : index
-# CHECK-NEXT:      %c512 = arith.constant 512 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
-# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c32 step %c1 {
-# CHECK-NEXT:          %subview_1 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      %c512 = arith.constant 512 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:        %subview_1 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
diff --git a/tests/filecheck/backends/test_matmul_mlir_distributed.py b/tests/filecheck/backends/test_matmul_mlir_distributed.py
index ad535fbe..21e4176a 100644
--- a/tests/filecheck/backends/test_matmul_mlir_distributed.py
+++ b/tests/filecheck/backends/test_matmul_mlir_distributed.py
@@ -57,30 +57,25 @@
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
 # CHECK-NEXT:      %0 = transform.sdist.create_memory_mesh %arg0 "memory_mesh" = <["mx"=2, "my"=2]> : !transform.any_op -> !transform.any_op
 # CHECK-NEXT:      %1 = transform.sdist.create_processor_mesh %arg0 "processor_mesh" = <["px"=2, "py"=2, "psx"=2, "psy"=8]> from "memory_mesh" : !transform.any_op -> !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %4 = transform.sdist.distribute_buffer_at %3 tensor 1 with ["mx", "*"] on "memory_mesh" : !transform.any_op -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %3 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op_2 {
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %3 = transform.sdist.distribute_buffer_at %2 tensor 1 with ["mx", "*"] on "memory_mesh" : !transform.any_op -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op {
 # CHECK-NEXT:        transform.apply_patterns.memref.fold_memref_alias_ops
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      %5 = transform.sdist.local_buffer_at %tiled_linalg_op_2 tensor 1 : !transform.any_op -> !transform.any_op
-# CHECK-NEXT:      %tiled_op, %forall_op = transform.structured.tile_using_forall %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %4 = transform.sdist.local_buffer_at %tiled_linalg_op tensor 1 : !transform.any_op -> !transform.any_op
+# CHECK-NEXT:      %tiled_op, %forall_op = transform.structured.tile_using_forall %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %forall_op "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./i1" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./j1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_7 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      %6 = transform.sdist.distribute_loop %forall_op {axis = "px", mesh = "processor_mesh"} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.annotate %6 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./i1" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./j1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_3 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %transformed, %tiledOp = transform.sdist.distribute_loop %forall_op {axis = "px", mesh = "processor_mesh"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %transformed "./i" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -92,75 +87,63 @@
 # CHECK-NEXT:    sdist.memory_mesh @memory_mesh = <["mx"=2, "my"=2]>
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:        %c32 = arith.constant 32 : index
-# CHECK-NEXT:        %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c32 step %c1_3 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      %0 = sdist.distribute %arg1 ["mx", "*"] on @memory_mesh : memref<512x32xf32>, memref<512x32xf32, 1>
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %c512 = arith.constant 512 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c512 step %c1_1 {
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:        %subview_2 = memref.subview %0[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32, 1> to memref<1x32xf32, strided<[32, 1], offset: ?>, 1>
-# CHECK-NEXT:        %subview_3 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:        %subview_0 = memref.subview %0[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32, 1> to memref<1x32xf32, strided<[32, 1], offset: ?>, 1>
+# CHECK-NEXT:        %subview_1 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
 # CHECK-NEXT:        %alloc = memref.alloc() : memref<1x32xf32, 2>
-# CHECK-NEXT:        %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:        sdist.read %0[%arg3, %c0_4] to %alloc : memref<512x32xf32, 1>, memref<1x32xf32, 2>
-# CHECK-NEXT:        %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:        sdist.read %0[%arg3, %c0_2] to %alloc : memref<512x32xf32, 1>, memref<1x32xf32, 2>
+# CHECK-NEXT:        %c0_3 = arith.constant 0 : index
 # CHECK-NEXT:        %c2 = arith.constant 2 : index
-# CHECK-NEXT:        %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:        sdist.for_distributed %arg4 = %c0_5 to %c2 step [%c1_6] on @processor_mesh("px") {
+# CHECK-NEXT:        %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:        sdist.for_distributed %arg4 = %c0_3 to %c2 step [%c1_4] on @processor_mesh("px") {
 # CHECK-NEXT:          %1 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_7 = memref.subview %subview[%1, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:          %subview_8 = memref.subview %alloc[0, 0] [1, 32] [1, 1] : memref<1x32xf32, 2> to memref<1x32xf32, strided<[32, 1]>, 2>
-# CHECK-NEXT:          %subview_9 = memref.subview %subview_3[%1, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:          %subview_5 = memref.subview %subview[%1, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:          %subview_6 = memref.subview %alloc[0, 0] [1, 32] [1, 1] : memref<1x32xf32, 2> to memref<1x32xf32, strided<[32, 1]>, 2>
+# CHECK-NEXT:          %subview_7 = memref.subview %subview_1[%1, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:          %c32 = arith.constant 32 : index
 # CHECK-NEXT:          %c16 = arith.constant 16 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_10 to %c32 step %c16 {
-# CHECK-NEXT:            %subview_11 = memref.subview %subview_7[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_12 = memref.subview %subview_8[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1]>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_13 = memref.subview %subview_9[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:          scf.for %arg5 = %c0_8 to %c32 step %c16 {
+# CHECK-NEXT:            %subview_9 = memref.subview %subview_5[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_10 = memref.subview %subview_6[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1]>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_11 = memref.subview %subview_7[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:            %c2_13 = arith.constant 2 : index
+# CHECK-NEXT:            %c1_14 = arith.constant 1 : index
 # CHECK-NEXT:            %c2_15 = arith.constant 2 : index
-# CHECK-NEXT:            %c1_16 = arith.constant 1 : index
-# CHECK-NEXT:            %c2_17 = arith.constant 2 : index
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_11[%c0_14, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_19 = memref.subview %subview_12[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_13[%c0_14, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_21 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_22 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_21 to %c16_22 step %c1_23 {
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_18[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_19[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:              %subview_33 = memref.subview %subview_20[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_31, %subview_32 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_33 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_9[%c0_12, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_10[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_11[%c0_12, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_20 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_21 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_19 to %c16_20 step %c1_21 {
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_30 = memref.subview %subview_17[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:              %subview_31 = memref.subview %subview_18[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_31 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
-# CHECK-NEXT:            %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:            %2 = arith.muli %c1_16, %c1_24 : index
-# CHECK-NEXT:            %3 = arith.addi %c0_14, %2 : index
-# CHECK-NEXT:            %subview_25 = memref.subview %subview_11[%3, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_26 = memref.subview %subview_12[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_27 = memref.subview %subview_13[%3, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_28 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_29 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_30 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_28 to %c16_29 step %c1_30 {
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_25[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_26[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:              %subview_33 = memref.subview %subview_27[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_31, %subview_32 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_33 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:            %2 = arith.muli %c1_14, %c1_22 : index
+# CHECK-NEXT:            %3 = arith.addi %c0_12, %2 : index
+# CHECK-NEXT:            %subview_23 = memref.subview %subview_9[%3, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_24 = memref.subview %subview_10[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_25 = memref.subview %subview_11[%3, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_26 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_27 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_26 to %c16_27 step %c1_28 {
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_23[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_30 = memref.subview %subview_24[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:              %subview_31 = memref.subview %subview_25[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_31 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
 # CHECK-NEXT:          } {"./j"}
 # CHECK-NEXT:          sdist.terminator
diff --git a/tests/filecheck/backends/test_matmul_mlir_parallel.py b/tests/filecheck/backends/test_matmul_mlir_parallel.py
index a492d967..13743673 100644
--- a/tests/filecheck/backends/test_matmul_mlir_parallel.py
+++ b/tests/filecheck/backends/test_matmul_mlir_parallel.py
@@ -43,27 +43,22 @@
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:      transform.structured.vectorize %arg0 : !transform.any_op
-# CHECK-NEXT:      transform.yield
+# CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_op, %forall_op = transform.structured.tile_using_forall %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      %tiled_op, %forall_op = transform.structured.tile_using_forall %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:      transform.annotate %forall_op "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %forall_op_5 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_op_4 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./i1" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./j1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_7 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      transform.yield
+# CHECK-NEXT:      %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %tiled_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %forall_op_1 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_op_0 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./i1" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./j1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_3 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
 # CHECK-NEXT:  
@@ -73,66 +68,54 @@
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:        %c32 = arith.constant 32 : index
-# CHECK-NEXT:        %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c32 step %c1_3 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:      %c512 = arith.constant 512 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c512 step %c1_1 {
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:        %subview_2 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %subview_3 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:        %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:        %subview_1 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
 # CHECK-NEXT:        scf.forall (%arg4) in (2) {
 # CHECK-NEXT:          %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[%0, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:          %subview_5 = memref.subview %subview_2[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %subview_6 = memref.subview %subview_3[%0, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %subview_2 = memref.subview %subview[%0, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:          %subview_3 = memref.subview %subview_0[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %subview_4 = memref.subview %subview_1[%0, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:          scf.forall (%arg5) in (2) {
 # CHECK-NEXT:            %1 = affine.apply #map1(%arg5)
-# CHECK-NEXT:            %subview_7 = memref.subview %subview_4[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_8 = memref.subview %subview_5[0, %1] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_9 = memref.subview %subview_6[0, %1] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:            %subview_5 = memref.subview %subview_2[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_6 = memref.subview %subview_3[0, %1] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_7 = memref.subview %subview_4[0, %1] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:            %c2 = arith.constant 2 : index
-# CHECK-NEXT:            %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:            %c2_12 = arith.constant 2 : index
-# CHECK-NEXT:            %subview_13 = memref.subview %subview_7[%c0_10, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_14 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_15 = memref.subview %subview_9[%c0_10, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:            %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:            %c2_10 = arith.constant 2 : index
+# CHECK-NEXT:            %subview_11 = memref.subview %subview_5[%c0_8, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_12 = memref.subview %subview_6[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_13 = memref.subview %subview_7[%c0_8, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_14 = arith.constant 0 : index
 # CHECK-NEXT:            %c16 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_16 to %c16 step %c1_17 {
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_13[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_26 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_15[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_25, %subview_26 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_27 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_14 to %c16 step %c1_15 {
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_11[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_24 = memref.subview %subview_12[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_25 = memref.subview %subview_13[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_23, %subview_24 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
-# CHECK-NEXT:            %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:            %2 = arith.muli %c1_11, %c1_18 : index
-# CHECK-NEXT:            %3 = arith.addi %c0_10, %2 : index
-# CHECK-NEXT:            %subview_19 = memref.subview %subview_7[%3, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_21 = memref.subview %subview_9[%3, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_22 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_23 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_22 to %c16_23 step %c1_24 {
-# CHECK-NEXT:              %subview_25 = memref.subview %subview_19[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_26 = memref.subview %subview_20[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_27 = memref.subview %subview_21[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_25, %subview_26 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_27 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_16 = arith.constant 1 : index
+# CHECK-NEXT:            %2 = arith.muli %c1_9, %c1_16 : index
+# CHECK-NEXT:            %3 = arith.addi %c0_8, %2 : index
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_5[%3, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_18 = memref.subview %subview_6[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_19 = memref.subview %subview_7[%3, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_21 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_20 to %c16_21 step %c1_22 {
+# CHECK-NEXT:              %subview_23 = memref.subview %subview_17[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_24 = memref.subview %subview_18[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_25 = memref.subview %subview_19[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_23, %subview_24 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
 # CHECK-NEXT:          } {"./j"}
 # CHECK-NEXT:        } {"./i"}
diff --git a/tests/filecheck/backends/test_matmul_ndiv_mlir.py b/tests/filecheck/backends/test_matmul_ndiv_mlir.py
index 767b7fdf..0345410e 100644
--- a/tests/filecheck/backends/test_matmul_ndiv_mlir.py
+++ b/tests/filecheck/backends/test_matmul_ndiv_mlir.py
@@ -47,28 +47,23 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [3, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [3, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_4) : (!transform.any_op) -> ()
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -84,20 +79,14 @@
 # CHECK-NEXT:      %0 = ub.poison : f32
 # CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c3 = arith.constant 3 : index
-# CHECK-NEXT:      %c512 = arith.constant 512 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
-# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %c3 = arith.constant 3 : index
 # CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c32 step %c1 {
-# CHECK-NEXT:          %subview_1 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      %c512 = arith.constant 512 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:        %subview_1 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
diff --git a/tests/filecheck/backends/test_matmul_relu_mlir.py b/tests/filecheck/backends/test_matmul_relu_mlir.py
index 824c419a..ac1a4fff 100644
--- a/tests/filecheck/backends/test_matmul_relu_mlir.py
+++ b/tests/filecheck/backends/test_matmul_relu_mlir.py
@@ -59,34 +59,29 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_matmul_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_4) : (!transform.any_op) -> ()
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      %3 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./i" : !transform.any_op
+# CHECK-NEXT:      %2 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./i" : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -99,21 +94,15 @@
 # CHECK-NEXT:      %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:      %0 = ub.poison : f32
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c2 = arith.constant 2 : index
-# CHECK-NEXT:      %c512 = arith.constant 512 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c4 = arith.constant 4 : index
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      %c512 = arith.constant 512 : index
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
 # CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:      %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c32 step %c1 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:        %subview_4 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
diff --git a/tests/filecheck/backends/test_matmul_scalar_mlir.py b/tests/filecheck/backends/test_matmul_scalar_mlir.py
index 6ffde280..bd3714ff 100644
--- a/tests/filecheck/backends/test_matmul_scalar_mlir.py
+++ b/tests/filecheck/backends/test_matmul_scalar_mlir.py
@@ -45,23 +45,18 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./j1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./j1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -70,70 +65,58 @@
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:        %c32 = arith.constant 32 : index
-# CHECK-NEXT:        %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c32 step %c1_3 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:      %c512 = arith.constant 512 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c512 step %c1_1 {
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:        %subview_2 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %subview_3 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:        %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:        %c4_5 = arith.constant 4 : index
+# CHECK-NEXT:        %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:        %subview_1 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:        %c4 = arith.constant 4 : index
 # CHECK-NEXT:        %c2 = arith.constant 2 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_4 to %c4_5 step %c2 {
-# CHECK-NEXT:          %subview_6 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:          %subview_7 = memref.subview %subview_2[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %subview_8 = memref.subview %subview_3[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c4 step %c2 {
+# CHECK-NEXT:          %subview_3 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:          %subview_4 = memref.subview %subview_0[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %subview_5 = memref.subview %subview_1[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %c0_6 = arith.constant 0 : index
 # CHECK-NEXT:          %c32 = arith.constant 32 : index
 # CHECK-NEXT:          %c16 = arith.constant 16 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_9 to %c32 step %c16 {
-# CHECK-NEXT:            %subview_10 = memref.subview %subview_6[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_11 = memref.subview %subview_7[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_12 = memref.subview %subview_8[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:            %c2_14 = arith.constant 2 : index
-# CHECK-NEXT:            %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:            %c2_16 = arith.constant 2 : index
-# CHECK-NEXT:            %subview_17 = memref.subview %subview_10[%c0_13, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_11[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_19 = memref.subview %subview_12[%c0_13, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_21 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_20 to %c16_21 step %c1_22 {
-# CHECK-NEXT:              %subview_30 = memref.subview %subview_17[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_18[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_19[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_30, %subview_31 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_32 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:          scf.for %arg5 = %c0_6 to %c32 step %c16 {
+# CHECK-NEXT:            %subview_7 = memref.subview %subview_3[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_8 = memref.subview %subview_4[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_9 = memref.subview %subview_5[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:            %c2_11 = arith.constant 2 : index
+# CHECK-NEXT:            %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:            %c2_13 = arith.constant 2 : index
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_7[%c0_10, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_9[%c0_10, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_18 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_17 to %c16_18 step %c1_19 {
+# CHECK-NEXT:              %subview_27 = memref.subview %subview_14[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_15[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_16[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            %0 = arith.muli %c1_15, %c1_23 : index
-# CHECK-NEXT:            %1 = arith.addi %c0_13, %0 : index
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_10[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_25 = memref.subview %subview_11[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_26 = memref.subview %subview_12[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_27 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_28 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_29 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_27 to %c16_28 step %c1_29 {
-# CHECK-NEXT:              %subview_30 = memref.subview %subview_24[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_25[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_26[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_30, %subview_31 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_32 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:            %0 = arith.muli %c1_12, %c1_20 : index
+# CHECK-NEXT:            %1 = arith.addi %c0_10, %0 : index
+# CHECK-NEXT:            %subview_21 = memref.subview %subview_7[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_22 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_23 = memref.subview %subview_9[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_25 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_24 to %c16_25 step %c1_26 {
+# CHECK-NEXT:              %subview_27 = memref.subview %subview_21[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_22[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_23[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
 # CHECK-NEXT:          } {"./j"}
 # CHECK-NEXT:        } {"./i"}
diff --git a/tests/filecheck/backends/test_mlir_pack_no_sdist.py b/tests/filecheck/backends/test_mlir_pack_no_sdist.py
index 2aea7e61..ca7e08ce 100644
--- a/tests/filecheck/backends/test_mlir_pack_no_sdist.py
+++ b/tests/filecheck/backends/test_mlir_pack_no_sdist.py
@@ -47,26 +47,21 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op_2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op {
 # CHECK-NEXT:        transform.apply_patterns.memref.fold_memref_alias_ops
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./j1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./j1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -75,70 +70,58 @@
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:        %c32 = arith.constant 32 : index
-# CHECK-NEXT:        %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c32 step %c1_3 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:      %c512 = arith.constant 512 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c512 step %c1_1 {
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:        %subview_2 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %subview_3 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:        %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:        %c4_5 = arith.constant 4 : index
+# CHECK-NEXT:        %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:        %subview_1 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:        %c4 = arith.constant 4 : index
 # CHECK-NEXT:        %c2 = arith.constant 2 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_4 to %c4_5 step %c2 {
-# CHECK-NEXT:          %subview_6 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:          %subview_7 = memref.subview %subview_2[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %subview_8 = memref.subview %subview_3[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c4 step %c2 {
+# CHECK-NEXT:          %subview_3 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:          %subview_4 = memref.subview %subview_0[0, 0] [1, 32] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %subview_5 = memref.subview %subview_1[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %c0_6 = arith.constant 0 : index
 # CHECK-NEXT:          %c32 = arith.constant 32 : index
 # CHECK-NEXT:          %c16 = arith.constant 16 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_9 to %c32 step %c16 {
-# CHECK-NEXT:            %subview_10 = memref.subview %subview_6[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_11 = memref.subview %subview_7[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_12 = memref.subview %subview_8[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:            %c2_14 = arith.constant 2 : index
-# CHECK-NEXT:            %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:            %c2_16 = arith.constant 2 : index
-# CHECK-NEXT:            %subview_17 = memref.subview %subview_10[%c0_13, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_11[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_19 = memref.subview %subview_12[%c0_13, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_20 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_21 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_22 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_20 to %c16_21 step %c1_22 {
-# CHECK-NEXT:              %subview_30 = memref.subview %subview_17[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_18[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_19[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_30, %subview_31 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_32 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:          scf.for %arg5 = %c0_6 to %c32 step %c16 {
+# CHECK-NEXT:            %subview_7 = memref.subview %subview_3[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_8 = memref.subview %subview_4[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_9 = memref.subview %subview_5[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:            %c2_11 = arith.constant 2 : index
+# CHECK-NEXT:            %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:            %c2_13 = arith.constant 2 : index
+# CHECK-NEXT:            %subview_14 = memref.subview %subview_7[%c0_10, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_9[%c0_10, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_18 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_17 to %c16_18 step %c1_19 {
+# CHECK-NEXT:              %subview_27 = memref.subview %subview_14[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_15[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_16[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            %0 = arith.muli %c1_15, %c1_23 : index
-# CHECK-NEXT:            %1 = arith.addi %c0_13, %0 : index
-# CHECK-NEXT:            %subview_24 = memref.subview %subview_10[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_25 = memref.subview %subview_11[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %subview_26 = memref.subview %subview_12[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_27 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_28 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_29 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_27 to %c16_28 step %c1_29 {
-# CHECK-NEXT:              %subview_30 = memref.subview %subview_24[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_25[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_26[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_30, %subview_31 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_32 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:            %0 = arith.muli %c1_12, %c1_20 : index
+# CHECK-NEXT:            %1 = arith.addi %c0_10, %0 : index
+# CHECK-NEXT:            %subview_21 = memref.subview %subview_7[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_22 = memref.subview %subview_8[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %subview_23 = memref.subview %subview_9[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_25 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_26 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_24 to %c16_25 step %c1_26 {
+# CHECK-NEXT:              %subview_27 = memref.subview %subview_21[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_22[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_23[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
 # CHECK-NEXT:          } {"./j"}
 # CHECK-NEXT:        } {"./i"}
diff --git a/tests/filecheck/backends/test_mlir_pack_sdist.py b/tests/filecheck/backends/test_mlir_pack_sdist.py
index 2be29080..4473d776 100644
--- a/tests/filecheck/backends/test_mlir_pack_sdist.py
+++ b/tests/filecheck/backends/test_mlir_pack_sdist.py
@@ -47,27 +47,22 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "./k" : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op_2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "./k" : !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %tiled_linalg_op {
 # CHECK-NEXT:        transform.apply_patterns.memref.fold_memref_alias_ops
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      %2 = transform.sdist.local_buffer_at %tiled_linalg_op_2 tensor 1 : !transform.any_op -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "./j" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "./i1" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "./j1" : !transform.any_op
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.sdist.local_buffer_at %tiled_linalg_op tensor 1 : !transform.any_op -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "./i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "./j" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "./i1" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "./j1" : !transform.any_op
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
@@ -76,73 +71,61 @@
 # CHECK-NEXT:  module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c4 = arith.constant 4 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:        %c32 = arith.constant 32 : index
-# CHECK-NEXT:        %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_2 to %c32 step %c1_3 {
-# CHECK-NEXT:          %subview_4 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
-# CHECK-NEXT:      %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:      %c512 = arith.constant 512 : index
-# CHECK-NEXT:      %c1_1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0_0 to %c512 step %c1_1 {
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
+# CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:        %subview_2 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        %subview_3 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:        %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:        %subview_1 = memref.subview %arg2[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
 # CHECK-NEXT:        %alloc = memref.alloc() : memref<1x32xf32, 2>
-# CHECK-NEXT:        %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:        sdist.read %arg1[%arg3, %c0_4] to %alloc : memref<512x32xf32>, memref<1x32xf32, 2>
-# CHECK-NEXT:        %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:        %c4_6 = arith.constant 4 : index
+# CHECK-NEXT:        %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:        sdist.read %arg1[%arg3, %c0_2] to %alloc : memref<512x32xf32>, memref<1x32xf32, 2>
+# CHECK-NEXT:        %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:        %c4 = arith.constant 4 : index
 # CHECK-NEXT:        %c2 = arith.constant 2 : index
-# CHECK-NEXT:        scf.for %arg4 = %c0_5 to %c4_6 step %c2 {
-# CHECK-NEXT:          %subview_7 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:          %subview_8 = memref.subview %alloc[0, 0] [1, 32] [1, 1] : memref<1x32xf32, 2> to memref<1x32xf32, strided<[32, 1]>, 2>
-# CHECK-NEXT:          %subview_9 = memref.subview %subview_3[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:        scf.for %arg4 = %c0_3 to %c4 step %c2 {
+# CHECK-NEXT:          %subview_4 = memref.subview %subview[%arg4, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:          %subview_5 = memref.subview %alloc[0, 0] [1, 32] [1, 1] : memref<1x32xf32, 2> to memref<1x32xf32, strided<[32, 1]>, 2>
+# CHECK-NEXT:          %subview_6 = memref.subview %subview_1[%arg4, 0] [2, 32] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:          %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:          %c32 = arith.constant 32 : index
 # CHECK-NEXT:          %c16 = arith.constant 16 : index
-# CHECK-NEXT:          scf.for %arg5 = %c0_10 to %c32 step %c16 {
-# CHECK-NEXT:            %subview_11 = memref.subview %subview_7[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_12 = memref.subview %subview_8[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1]>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_13 = memref.subview %subview_9[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:            %c2_15 = arith.constant 2 : index
-# CHECK-NEXT:            %c1_16 = arith.constant 1 : index
-# CHECK-NEXT:            %c2_17 = arith.constant 2 : index
-# CHECK-NEXT:            %subview_18 = memref.subview %subview_11[%c0_14, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_19 = memref.subview %subview_12[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_20 = memref.subview %subview_13[%c0_14, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_21 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_22 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_21 to %c16_22 step %c1_23 {
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_18[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_19[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:              %subview_33 = memref.subview %subview_20[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_31, %subview_32 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_33 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:          scf.for %arg5 = %c0_7 to %c32 step %c16 {
+# CHECK-NEXT:            %subview_8 = memref.subview %subview_4[0, 0] [2, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_9 = memref.subview %subview_5[0, %arg5] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1]>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_10 = memref.subview %subview_6[0, %arg5] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:            %c2_12 = arith.constant 2 : index
+# CHECK-NEXT:            %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:            %c2_14 = arith.constant 2 : index
+# CHECK-NEXT:            %subview_15 = memref.subview %subview_8[%c0_11, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_16 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_17 = memref.subview %subview_10[%c0_11, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_19 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_18 to %c16_19 step %c1_20 {
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_15[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_16[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:              %subview_30 = memref.subview %subview_17[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_28, %subview_29 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_30 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
-# CHECK-NEXT:            %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:            %0 = arith.muli %c1_16, %c1_24 : index
-# CHECK-NEXT:            %1 = arith.addi %c0_14, %0 : index
-# CHECK-NEXT:            %subview_25 = memref.subview %subview_11[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:            %subview_26 = memref.subview %subview_12[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:            %subview_27 = memref.subview %subview_13[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:            %c0_28 = arith.constant 0 : index
-# CHECK-NEXT:            %c16_29 = arith.constant 16 : index
-# CHECK-NEXT:            %c1_30 = arith.constant 1 : index
-# CHECK-NEXT:            scf.for %arg6 = %c0_28 to %c16_29 step %c1_30 {
-# CHECK-NEXT:              %subview_31 = memref.subview %subview_25[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:              %subview_32 = memref.subview %subview_26[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
-# CHECK-NEXT:              %subview_33 = memref.subview %subview_27[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_31, %subview_32 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_33 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:            %c1_21 = arith.constant 1 : index
+# CHECK-NEXT:            %0 = arith.muli %c1_13, %c1_21 : index
+# CHECK-NEXT:            %1 = arith.addi %c0_11, %0 : index
+# CHECK-NEXT:            %subview_22 = memref.subview %subview_8[%1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:            %subview_23 = memref.subview %subview_9[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x16xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:            %subview_24 = memref.subview %subview_10[%1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:            %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:            %c16_26 = arith.constant 16 : index
+# CHECK-NEXT:            %c1_27 = arith.constant 1 : index
+# CHECK-NEXT:            scf.for %arg6 = %c0_25 to %c16_26 step %c1_27 {
+# CHECK-NEXT:              %subview_28 = memref.subview %subview_22[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:              %subview_29 = memref.subview %subview_23[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>, 2> to memref<1x1xf32, strided<[32, 1], offset: ?>, 2>
+# CHECK-NEXT:              %subview_30 = memref.subview %subview_24[0, %arg6] [1, 1] [1, 1] : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:              linalg.matmul {__xtc_id_C_} ins(%subview_28, %subview_29 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>, 2>) outs(%subview_30 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:            } {"./j1"}
 # CHECK-NEXT:          } {"./j"}
 # CHECK-NEXT:        } {"./i"}
diff --git a/tests/filecheck/schedules/test_descript_slice_bigger.py b/tests/filecheck/schedules/test_descript_slice_bigger.py
index f295916a..e3b929b4 100644
--- a/tests/filecheck/schedules/test_descript_slice_bigger.py
+++ b/tests/filecheck/schedules/test_descript_slice_bigger.py
@@ -62,38 +62,33 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "C/k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "C/j" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.split %tiled_linalg_op_4 after 32  {dimension = 0 : i64} : !transform.any_op
-# CHECK-NEXT:      %3:2 = transform.split_handle %2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %3#0 tile_sizes [32, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "C/i[0]/i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "C/i[0]/i0" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "C/i[0]/k0" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_10) : (!transform.any_op) -> ()
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3#1 tile_sizes [18, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "C/i[1]/i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "C/i[1]/i0" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "C/i[1]/k0" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_16) : (!transform.any_op) -> ()
-# CHECK-NEXT:      %4 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %4 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "C/k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "C/j" : !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.split %tiled_linalg_op_0 after 32  {dimension = 0 : i64} : !transform.any_op
+# CHECK-NEXT:      %2:2 = transform.split_handle %1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %2#0 tile_sizes [32, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "C/i[0]/i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "C/i[0]/i0" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "C/i[0]/k0" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_6) : (!transform.any_op) -> ()
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2#1 tile_sizes [18, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "C/i[1]/i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "C/i[1]/i0" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "C/i[1]/k0" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_12) : (!transform.any_op) -> ()
+# CHECK-NEXT:      %3 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %3 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %4 {
+# CHECK-NEXT:      transform.apply_patterns to %3 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -107,20 +102,13 @@
 # CHECK-NEXT:      %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:      %c18 = arith.constant 18 : index
 # CHECK-NEXT:      %0 = ub.poison : f32
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
 # CHECK-NEXT:      %c64 = arith.constant 64 : index
-# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c50 = arith.constant 50 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c50 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 64] [1, 1] : memref<50x64xf32> to memref<1x64xf32, strided<[64, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c64 step %c1 {
-# CHECK-NEXT:          %subview_1 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x64xf32, strided<[64, 1], offset: ?>> to memref<1x1xf32, strided<[64, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%subview_1 : memref<1x1xf32, strided<[64, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%arg2 : memref<50x64xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c64 step %c32 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [50, 32] [1, 1] : memref<50x64xf32> to memref<50x32xf32, strided<[64, 1], offset: ?>>
 # CHECK-NEXT:        %subview_1 = memref.subview %arg1[%arg3, 0] [32, 64] [1, 1] : memref<64x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
diff --git a/tests/filecheck/schedules/test_descript_slice_smaller.py b/tests/filecheck/schedules/test_descript_slice_smaller.py
index 551f001b..76323e0d 100644
--- a/tests/filecheck/schedules/test_descript_slice_smaller.py
+++ b/tests/filecheck/schedules/test_descript_slice_smaller.py
@@ -62,38 +62,33 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "C/k" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "C/j" : !transform.any_op
-# CHECK-NEXT:      %2 = transform.structured.split %tiled_linalg_op_4 after 18  {dimension = 0 : i64} : !transform.any_op
-# CHECK-NEXT:      %3:2 = transform.split_handle %2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %3#0 tile_sizes [18, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "C/i[0]/i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "C/i[0]/i0" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_11 "C/i[0]/k0" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_10) : (!transform.any_op) -> ()
-# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3#1 tile_sizes [32, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_13 "C/i[1]/i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_15 "C/i[1]/i0" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_17 "C/i[1]/k0" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_16) : (!transform.any_op) -> ()
-# CHECK-NEXT:      %4 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %4 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "C/k" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "C/j" : !transform.any_op
+# CHECK-NEXT:      %1 = transform.structured.split %tiled_linalg_op_0 after 18  {dimension = 0 : i64} : !transform.any_op
+# CHECK-NEXT:      %2:2 = transform.split_handle %1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %2#0 tile_sizes [18, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "C/i[0]/i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "C/i[0]/i0" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_7 "C/i[0]/k0" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_6) : (!transform.any_op) -> ()
+# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2#1 tile_sizes [32, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_9 "C/i[1]/i" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_11 "C/i[1]/i0" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_13 "C/i[1]/k0" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_12) : (!transform.any_op) -> ()
+# CHECK-NEXT:      %3 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %3 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %4 {
+# CHECK-NEXT:      transform.apply_patterns to %3 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -106,21 +101,14 @@
 # CHECK-NEXT:    func.func @matmul(%arg0: memref<50x64xf32> {llvm.noalias}, %arg1: memref<64x64xf32> {llvm.noalias}, %arg2: memref<50x64xf32> {llvm.noalias}) {
 # CHECK-NEXT:      %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:      %0 = ub.poison : f32
+# CHECK-NEXT:      %c1 = arith.constant 1 : index
 # CHECK-NEXT:      %c18 = arith.constant 18 : index
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
 # CHECK-NEXT:      %c64 = arith.constant 64 : index
-# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:      %c0 = arith.constant 0 : index
-# CHECK-NEXT:      %c50 = arith.constant 50 : index
-# CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c50 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 64] [1, 1] : memref<50x64xf32> to memref<1x64xf32, strided<[64, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c64 step %c1 {
-# CHECK-NEXT:          %subview_1 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x64xf32, strided<[64, 1], offset: ?>> to memref<1x1xf32, strided<[64, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%subview_1 : memref<1x1xf32, strided<[64, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%arg2 : memref<50x64xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c64 step %c32 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [50, 32] [1, 1] : memref<50x64xf32> to memref<50x32xf32, strided<[64, 1], offset: ?>>
 # CHECK-NEXT:        %subview_1 = memref.subview %arg1[%arg3, 0] [32, 64] [1, 1] : memref<64x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
diff --git a/tests/filecheck/schedules/test_matmul_descript_mlir.py b/tests/filecheck/schedules/test_matmul_descript_mlir.py
index 814ae0e6..d6b9540f 100644
--- a/tests/filecheck/schedules/test_matmul_descript_mlir.py
+++ b/tests/filecheck/schedules/test_matmul_descript_mlir.py
@@ -56,28 +56,23 @@
 # CHECK-NEXT:      transform.yield 
 # CHECK-NEXT:    }
 # CHECK-NEXT:    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops "./i" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:      %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_3 "C/K" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_5 "C/I" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_7 "C/J" : !transform.any_op
-# CHECK-NEXT:      %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:      transform.annotate %loops_9 "C/I0" : !transform.any_op
-# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
-# CHECK-NEXT:      transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
-# CHECK-NEXT:      %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      %0 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops "C/K" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_1 "C/I" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_3 "C/J" : !transform.any_op
+# CHECK-NEXT:      %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:      transform.annotate %loops_5 "C/I0" : !transform.any_op
+# CHECK-NEXT:      transform.include @_vecto failures(suppress) (%tiled_linalg_op_4) : (!transform.any_op) -> ()
+# CHECK-NEXT:      transform.loop.unroll %loops_5 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:      %1 = transform.get_parent_op %loops {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.reduction_to_contract
 # CHECK-NEXT:        transform.apply_patterns.vector.transfer_permutation_patterns
 # CHECK-NEXT:      } : !transform.any_op
-# CHECK-NEXT:      transform.apply_patterns to %2 {
+# CHECK-NEXT:      transform.apply_patterns to %1 {
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_outerproduct
 # CHECK-NEXT:        transform.apply_patterns.vector.lower_contraction
 # CHECK-NEXT:      } : !transform.any_op
@@ -91,20 +86,14 @@
 # CHECK-NEXT:      %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:      %0 = ub.poison : f32
 # CHECK-NEXT:      %c16 = arith.constant 16 : index
-# CHECK-NEXT:      %c2 = arith.constant 2 : index
-# CHECK-NEXT:      %c512 = arith.constant 512 : index
 # CHECK-NEXT:      %c32 = arith.constant 32 : index
-# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %c2 = arith.constant 2 : index
 # CHECK-NEXT:      %c4 = arith.constant 4 : index
 # CHECK-NEXT:      %c1 = arith.constant 1 : index
-# CHECK-NEXT:      scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:        %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:        scf.for %arg4 = %c0 to %c32 step %c1 {
-# CHECK-NEXT:          %subview_1 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:          linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:        } {"./j"}
-# CHECK-NEXT:      } {"./i"}
+# CHECK-NEXT:      %c512 = arith.constant 512 : index
+# CHECK-NEXT:      %c0 = arith.constant 0 : index
+# CHECK-NEXT:      %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:      linalg.fill {__xtc_id_C_0_} ins(%cst_0 : f32) outs(%arg2 : memref<4x32xf32>)
 # CHECK-NEXT:      scf.for %arg3 = %c0 to %c512 step %c1 {
 # CHECK-NEXT:        %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:        %subview_1 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>