From f954a5c7053becec42a9fdc5cd4add64c16e98d6 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Tue, 3 Feb 2026 12:32:30 +0100
Subject: [PATCH 01/14] mlir backend: parameterized xdsl type

---
 src/xtc/backends/mlir/MlirGraphBackend.py | 17 ++++++++++-------
 src/xtc/backends/mlir/MlirNodeBackend.py  | 14 ++++++++------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index 191cad027..c39fc5b2e 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -7,7 +7,7 @@
 
 from xdsl.dialects.func import FuncOp as xdslFuncOp
 from xdsl.dialects import func, memref
-from xdsl.dialects.builtin import MemRefType, f32, f64
+from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64
 from xdsl.ir import Region, Block, Operation
 from xdsl.builder import ImplicitBuilder
 
@@ -28,7 +28,9 @@ def __init__(
         concluding_passes: list[str] = [],
         always_vectorize: bool = False,
         no_alias: bool = True,
+        use_tensor_dialect: bool = False,
     ):
+        self.xdsl_type = TensorType if use_tensor_dialect else MemRefType
         if isinstance(xdsl_func, XTCGraph):
             assert nodes is None
             graph = xdsl_func
@@ -128,6 +130,7 @@ def _init_from_graph(
                     always_vectorize=always_vectorize,
                     concluding_passes=concluding_passes,
                     id=f"__xtc_id_{node_id}_",
+                    xdsl_type=self.xdsl_type
                 )
         return payload, nodes_dict
 
@@ -137,10 +140,10 @@ def _xdsl_elt_shape_from_tensortype(self, type: XTCTensorType) -> tuple[Any, Any
 
     def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any:
         elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
-        return MemRefType(elt_type, shape)
+        return self.xdsl_type(elt_type, shape)
 
     def _np_types_spec(
-        self, types: list[MemRefType]
+        self, types: list[MemRefType | TensorType]
     ) -> list[dict[str, tuple[int, ...] | str]]:
         types_map = {"f32": "float32", "f64": "float64"}
         types_spec: list[dict[str, tuple[int, ...] | str]] = [
@@ -156,12 +159,12 @@ def _np_types_spec(
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]]
-        list_memref_tys = cast(list[MemRefType], inputs_args_types)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types)
+        return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]]
-        list_memref_tys = cast(list[MemRefType], outputs_args_types)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[self.xdsl_type], outputs_args_types)
+        return self._np_types_spec(list_xdsl_tys)
diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py
index 135e99b80..8ae0c97e7 100644
--- a/src/xtc/backends/mlir/MlirNodeBackend.py
+++ b/src/xtc/backends/mlir/MlirNodeBackend.py
@@ -6,7 +6,7 @@
 from typing_extensions import override
 
 from xdsl.ir import Operation as xdslOperation
-from xdsl.dialects.builtin import MemRefType as xdslAnyMemRefType
+from xdsl.dialects.builtin import MemRefType, TensorType
 from xdsl.dialects.builtin import UnitAttr as xdslUnitAttr
 from xtc.utils.xdsl_aux import xdsl_operator_to_function
 
@@ -26,8 +26,10 @@ def __init__(
         always_vectorize: bool = False,
         no_alias: bool = True,
         id: str | None = None,
+        xdsl_type: MemRefType | TensorType = MemRefType,
     ):
         self._graph = None
+        self.xdsl_type = xdsl_type
         if id is None:
             self.op_id_attribute = f"__id{MlirNodeBackend.count}__"
             MlirNodeBackend.count += 1
@@ -48,7 +50,7 @@ def __init__(
         self.loop_stamps = loop_stamps
 
     def _np_types_spec(
-        self, types: list[xdslAnyMemRefType]
+        self, types: list[MemRefType | TensorType]
     ) -> list[dict[str, tuple[int, ...] | str]]:
         types_map = {"f32": "float32", "f64": "float64"}
         types_spec: list[dict[str, tuple[int, ...] | str]] = [
@@ -63,11 +65,11 @@ def _np_types_spec(
     @override
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.inputs]  # type: ignore
-        list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys)
+        return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.outputs]  # type: ignore
-        list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys)
+        return self._np_types_spec(list_xdsl_tys)

From 551b946dc239930e289efa76d01a329a1d1fea7b Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Wed, 4 Feb 2026 10:42:57 +0100
Subject: [PATCH 02/14] tensor: added matmul, tensor graph parsing

---
 src/xtc/backends/mlir/MlirGraphBackend.py     | 10 +++-
 src/xtc/backends/mlir/MlirOps.py              | 56 +++++++++++++------
 src/xtc/utils/xdsl_aux.py                     |  8 ++-
 .../backends/test_matmul_mlir_tensor.py       | 31 ++++++++++
 4 files changed, 83 insertions(+), 22 deletions(-)
 create mode 100644 tests/filecheck/backends/test_matmul_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index c39fc5b2e..ab2cc09c5 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -64,7 +64,7 @@ def _init_from_xdsl(
     def _xdsl_generate_node(
         self, node: XTCNode, block: Block, variables: dict[str, Any]
     ):
-        operation = MlirOperation.from_operation(node.operation, name=node.name)
+        operation = MlirOperation.from_operation(node.operation, name=node.name, op_type=self.xdsl_type)
         names = [*node.inputs, *node.outputs]
         assert node.inputs_types is not None and node.outputs_types is not None
         types = [*node.inputs_types, *node.outputs_types]
@@ -108,13 +108,17 @@ def _init_from_graph(
         for node in graph.nodes.values():
             node_attrs = self._xdsl_generate_node(node, inlined_block, variables)
             block_attrs.append(node_attrs)
+        return_val =  block_attrs[-1]["nodes_map"]["return_node_id"]
         with ImplicitBuilder(inlined_block):
-            func.ReturnOp()
+            if return_val:
+                func.ReturnOp(return_val)
+            else:
+                func.ReturnOp()
         region = Region([inlined_block])  # type: ignore # issue with mypy
         payload = xdslFuncOp.from_region(
             name=graph.name,
             input_types=params_types,
-            return_types=[],
+            return_types=[params_types[-1]] if return_val else [],
             region=region,
         )
         nodes_dict = {}
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index be687b444..14e5fb47f 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -7,9 +7,11 @@
 from typing_extensions import override
 from typing import Any, Type, TypeAlias, cast
 
-from xdsl.dialects import linalg, arith, builtin, memref
+from xdsl.dialects import linalg, arith, builtin, memref, tensor
 from xdsl.dialects.builtin import (
     MemRefType,
+    TensorType,
+    Sequence,
     f32,
     f64,
     i64,
@@ -42,8 +44,9 @@ def __init__(
         args: tuple[Any, ...],
         attrs: dict[str, Any] = {},
         name: str | None = None,
+        op_type: MemRefType | TensorType = MemRefType,
     ) -> None:
-        self.operator = operator(args, attrs, name=name)
+        self.operator = operator(args, attrs, name=name, op_type=op_type)
         self.args = args
         self.attrs = attrs
         self.name = self.operator.name if name is None else name
@@ -78,7 +81,7 @@ def np_outputs_spec(self) -> list[dict[str, Any]]:
         return outputs_spec
 
     @classmethod
-    def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation":
+    def from_operation(cls, xtc_op: Operation, name: str | None, op_type: MemRefType | TensorType) -> "MlirOperation":
         dims = xtc_op.dims.values()
         dtype = xtc_op.inputs_types[0].dtype  # TODO: currently get dtype from 1st arg
         args = tuple([*dims, dtype])
@@ -88,6 +91,7 @@ def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation":
             args,
             dict(attrs),
             name=name,
+            op_type=op_type,
         )
 
 
@@ -97,11 +101,12 @@ class MlirOperator(ABC):
     KINDS = ""
 
     def __init__(
-        self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None
+            self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None, op_type: MemRefType | TensorType = MemRefType
     ) -> None:
         self.args = args
         self.attrs = {**attrs}
         self.name = name if name is not None else self.DEFAULT_NAME
+        self.op_type = op_type
 
     @abstractmethod
     def generate_op(
@@ -149,24 +154,42 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
+                self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 3
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
-            fill = linalg.FillOp(
-                res=(),
-                inputs=(cst0.results[0],),
-                outputs=(args[2],),
-            )
-            reduce = linalg.MatmulOp(
-                res=(),
-                inputs=(args[0], args[1]),
-                outputs=(args[2],),
-            )
+
+            if self.op_type == MemRefType:
+                fill = linalg.FillOp(
+                    res=(),
+                    inputs=(cst0.results[0],),
+                    outputs=(args[2],),
+                )
+                reduce = linalg.MatmulOp(
+                    res=(),
+                    inputs=(args[0], args[1]),
+                    outputs=(args[2],),
+                )
+            else:
+                empty = tensor.EmptyOp(
+                    dynamic_sizes=[],
+                    tensor_type=args[2].type,
+                )
+                fill = linalg.FillOp(
+                    res=(empty.results[0].type,),
+                    inputs=(cst0.results[0],),
+                    outputs=(empty.results[0],),
+                )
+                reduce = linalg.MatmulOp(
+                    res=(args[2].type,),
+                    inputs=(args[0], args[1]),
+                    outputs=(fill.results[0],),
+                )
+
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
         fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
@@ -175,6 +198,7 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 reduce_node_id: reduce,
+                "return_node_id": reduce if self.op_type == TensorType else None,
             },
             "dims_sizes": [
                 {"i": Ki, "j": Kj},
diff --git a/src/xtc/utils/xdsl_aux.py b/src/xtc/utils/xdsl_aux.py
index c339f02d3..0061ccba5 100644
--- a/src/xtc/utils/xdsl_aux.py
+++ b/src/xtc/utils/xdsl_aux.py
@@ -12,6 +12,7 @@
 from xdsl.dialects.arith import ConstantOp
 from xdsl.dialects.builtin import (
     MemRefType,
+    TensorType,
     IntegerAttr,
     FloatAttr,
     IntegerType,
@@ -19,7 +20,7 @@
 
 from xdsl.context import Context
 from xdsl.parser import Parser
-from xdsl.dialects import func, linalg, arith, memref
+from xdsl.dialects import func, linalg, arith, memref, tensor
 from xdsl.dialects.builtin import ModuleOp
 
 
@@ -29,6 +30,7 @@ def parse_xdsl_module(source: str) -> ModuleOp:
     context.load_dialect(linalg.Linalg)
     context.load_dialect(arith.Arith)
     context.load_dialect(memref.MemRef)
+    context.load_dialect(tensor.Tensor)
     parser = Parser(context, source)
     module = parser.parse_module()
     return module
@@ -39,7 +41,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp:
     operands = source_op.operands
     shaped_types, scalar_types = [], []
     for o in operands:
-        if isa(o.type, MemRefType):
+        if isa(o.type, MemRefType) or isa(o.type, TensorType):
             shaped_types.append(o.type)
         else:
             scalar_types.append(o.type)
@@ -49,7 +51,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp:
     concrete_operands = []
     shaped_count, scalar_count = 0, 0
     for o in operands:
-        if isa(o.type, MemRefType):
+        if isa(o.type, MemRefType) or isa(o.type, TensorType):
             concrete_operands.append(payload.args[shaped_count])
             shaped_count += 1
         else:
diff --git a/tests/filecheck/backends/test_matmul_mlir_tensor.py b/tests/filecheck/backends/test_matmul_mlir_tensor.py
new file mode 100644
index 000000000..29d0392a8
--- /dev/null
+++ b/tests/filecheck/backends/test_matmul_mlir_tensor.py
@@ -0,0 +1,31 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="matmul") as gb:
+    O.matmul(a, b, name="C")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph)
+
+sch = impl.get_scheduler()
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="matmul_mlir",
+    print_source_ir=True,
+    print_transformed_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")

From 2800d44a54b7a049a679000673f1f3eec9652cc1 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Fri, 6 Feb 2026 12:17:33 +0100
Subject: [PATCH 03/14] tensor-dialect: added bufferization pass, single matmul
 ops

---
 src/xtc/backends/mlir/MlirCompiler.py         |  16 +++
 src/xtc/backends/mlir/MlirCompilerPasses.py   |  15 +++
 src/xtc/backends/mlir/MlirConfig.py           |   1 +
 src/xtc/backends/mlir/MlirGraphBackend.py     |  62 ++++++---
 src/xtc/backends/mlir/MlirNodeBackend.py      |   8 +-
 src/xtc/backends/mlir/MlirOps.py              |  37 ++++--
 .../tensor_dialect/test_matmul_mlir_tensor.py | 120 ++++++++++++++++++
 .../backends/test_matmul_mlir_tensor.py       |  31 -----
 8 files changed, 226 insertions(+), 64 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
 delete mode 100644 tests/filecheck/backends/test_matmul_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index b4c9bfe34..9d3e55354 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -20,6 +20,7 @@
 from xtc.backends.mlir.MlirCompilerPasses import (
     MlirProgramInsertTransformPass,
     MlirProgramApplyTransformPass,
+    MlirProgramApplyPasses,
 )
 
 from xtc.backends.mlir.MlirTarget import (
@@ -149,6 +150,19 @@ def mlir_apply_transform_pass(self) -> None:
         if self._config.print_transformed_ir:
             self.dump_ir("IR Dump After transform")
 
+    def mlir_apply_tensor_lowering_pass(self) -> None:
+        """Run the tensor bufferization passes and optionally dump the IR."""
+        bufferization_passes = [
+            "eliminate-empty-tensors",  # causes ops to write directly to out buffer
+            "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}",
+        ]
+        lowering = MlirProgramApplyPasses(
+            mlir_program=self._mlir_program,
+        )
+        lowering.run(bufferization_passes)
+        if self._config.print_bufferization_ir:
+            self.dump_ir("IR Dump After Lowering")
+
     def _save_temp(self, fname: str, content: Any) -> None:
         if not self._config.save_temps:
             return
@@ -190,6 +204,8 @@ def compile(self) -> None:
 
         save_temp(src_ir_dump_file, self._mlir_program.mlir_module)
 
+        self.mlir_apply_tensor_lowering_pass()
+
         self.mlir_insert_transform_pass()
         save_temp(mlir_btrn_dump_file, self._mlir_program.mlir_module)
 
diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index de33ff28d..7a7e2da7e 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -534,3 +534,18 @@ def run(self) -> None:
                 transform_op.erase()
             else:
                 break
+
+
+class MlirProgramApplyPasses:
+    """Run a list of named MLIR passes over a program's module."""
+
+    def __init__(self, mlir_program: RawMlirProgram) -> None:
+        self._mlir_program = mlir_program
+
+    def run(self, pass_names: list[str]) -> None:
+        """Add every entry of pass_names to a new PassManager and run it."""
+        program = self._mlir_program
+        manager = PassManager(context=program.mlir_context)
+        for pass_name in pass_names:
+            manager.add(pass_name)  # type: ignore # no attribute add
+        manager.run(program.mlir_module.operation)
diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py
index 2d0ab5128..653456815 100644
--- a/src/xtc/backends/mlir/MlirConfig.py
+++ b/src/xtc/backends/mlir/MlirConfig.py
@@ -22,6 +22,7 @@ class MlirConfig:
     print_assembly: bool = False
     visualize_jumps: bool = True
     print_lowered_ir: bool = False
+    print_bufferization_ir: bool = False
     debug: bool = False
     color: bool = False
     concluding_passes: list[str] = field(default_factory=list)
diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index ab2cc09c5..3f88cc793 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024-2026 The XTC Project Authors
 #
-from typing import cast, Any
+from typing import cast, Any, Type
 from typing_extensions import override
 
 from xdsl.dialects.func import FuncOp as xdslFuncOp
-from xdsl.dialects import func, memref
-from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64
+from xdsl.dialects import func, memref, bufferization
+from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr
 from xdsl.ir import Region, Block, Operation
 from xdsl.builder import ImplicitBuilder
 
@@ -30,7 +30,9 @@ def __init__(
         no_alias: bool = True,
         use_tensor_dialect: bool = False,
     ):
-        self.xdsl_type = TensorType if use_tensor_dialect else MemRefType
+        self.xdsl_type: Type[TensorType] | Type[MemRefType] = (
+            TensorType if use_tensor_dialect else MemRefType
+        )
         if isinstance(xdsl_func, XTCGraph):
             assert nodes is None
             graph = xdsl_func
@@ -64,7 +66,11 @@ def _init_from_xdsl(
     def _xdsl_generate_node(
         self, node: XTCNode, block: Block, variables: dict[str, Any]
     ):
-        operation = MlirOperation.from_operation(node.operation, name=node.name, op_type=self.xdsl_type)
+        operation = MlirOperation.from_operation(
+            node.operation,
+            name=node.name,
+            op_type=self.xdsl_type,  # type: ignore
+        )
         names = [*node.inputs, *node.outputs]
         assert node.inputs_types is not None and node.outputs_types is not None
         types = [*node.inputs_types, *node.outputs_types]
@@ -81,7 +87,8 @@ def _xdsl_generate_node(
             variables[name] = alloca.results[0]
         args = [variables[name] for name in names]
         _, attrs = operation.generate(block=block, args=args)
-        return attrs
+        last_node = attrs["nodes_map"].get("return_node_id")
+        return attrs, last_node
 
     def _init_from_graph(
         self,
@@ -97,28 +104,43 @@ def _init_from_graph(
         )
         params_types = [
             self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type))
-            for tensor_type in [*inputs_types, *outputs_types]
+            for tensor_type in inputs_types  # [*inputs_types, *outputs_types]
         ]
+        # graph output types are always memrefs
+        params_types.extend(
+            self._memref_type_from_tensortype(cast(XTCTensorType, tensor_type))
+            for tensor_type in outputs_types
+        )
         inlined_block = Block(arg_types=params_types)
         variables = {
             name: arg
             for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args)
         }
         block_attrs = []
+        last_node = None
         for node in graph.nodes.values():
-            node_attrs = self._xdsl_generate_node(node, inlined_block, variables)
+            node_attrs, last_node = self._xdsl_generate_node(
+                node, inlined_block, variables
+            )
             block_attrs.append(node_attrs)
-        return_val =  block_attrs[-1]["nodes_map"]["return_node_id"]
         with ImplicitBuilder(inlined_block):
-            if return_val:
-                func.ReturnOp(return_val)
-            else:
-                func.ReturnOp()
+            if self.xdsl_type == TensorType:
+                assert last_node
+                reduce = bufferization.MaterializeInDestinationOp(
+                    # first operand: last node's result; second: last block argument
+                    operands=((last_node.results[0],), (inlined_block.args[-1],)),
+                    # empty result entry: no result value is requested from this op
+                    # NOTE(review): presumably because the destination is a memref; confirm
+                    result_types=((),),
+                )
+                reduce.attributes["writable"] = UnitAttr()
+                reduce.attributes["restrict"] = UnitAttr()
+            func.ReturnOp()
         region = Region([inlined_block])  # type: ignore # issue with mypy
         payload = xdslFuncOp.from_region(
             name=graph.name,
             input_types=params_types,
-            return_types=[params_types[-1]] if return_val else [],
+            return_types=[],
             region=region,
         )
         nodes_dict = {}
@@ -134,7 +156,7 @@ def _init_from_graph(
                     always_vectorize=always_vectorize,
                     concluding_passes=concluding_passes,
                     id=f"__xtc_id_{node_id}_",
-                    xdsl_type=self.xdsl_type
+                    xdsl_type=self.xdsl_type,
                 )
         return payload, nodes_dict
 
@@ -146,8 +168,12 @@ def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any:
         elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
         return self.xdsl_type(elt_type, shape)
 
+    def _memref_type_from_tensortype(self, type: XTCTensorType) -> Any:
+        elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
+        return MemRefType(elt_type, shape)
+
     def _np_types_spec(
-        self, types: list[MemRefType | TensorType]
+        self, types: list[MemRefType] | list[TensorType]
     ) -> list[dict[str, tuple[int, ...] | str]]:
         types_map = {"f32": "float32", "f64": "float64"}
         types_spec: list[dict[str, tuple[int, ...] | str]] = [
@@ -163,12 +189,12 @@ def _np_types_spec(
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]]
-        list_xdsl_tys = cast(list[self.xdsl_type], inputs_args_types)
+        list_xdsl_tys = cast(list[MemRefType] | list[TensorType], inputs_args_types)
         return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]]
-        list_xdsl_tys = cast(list[self.xdsl_type], outputs_args_types)
+        list_xdsl_tys = cast(list[MemRefType], outputs_args_types)
         return self._np_types_spec(list_xdsl_tys)
diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py
index 8ae0c97e7..f809e392c 100644
--- a/src/xtc/backends/mlir/MlirNodeBackend.py
+++ b/src/xtc/backends/mlir/MlirNodeBackend.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024-2026 The XTC Project Authors
 #
-from typing import cast, Any
+from typing import cast, Any, Type
 from typing_extensions import override
 
 from xdsl.ir import Operation as xdslOperation
@@ -26,7 +26,7 @@ def __init__(
         always_vectorize: bool = False,
         no_alias: bool = True,
         id: str | None = None,
-        xdsl_type: MemRefType | TensorType = MemRefType,
+        xdsl_type: Type[TensorType] | Type[MemRefType] = MemRefType,
     ):
         self._graph = None
         self.xdsl_type = xdsl_type
@@ -65,11 +65,11 @@ def _np_types_spec(
     @override
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.inputs]  # type: ignore
-        list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys)
+        list_xdsl_tys = cast(list[MemRefType | TensorType], list_attr_tys)
         return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.outputs]  # type: ignore
-        list_xdsl_tys = cast(list[self.xdsl_type], list_attr_tys)
+        list_xdsl_tys = cast(list[MemRefType | TensorType], list_attr_tys)
         return self._np_types_spec(list_xdsl_tys)
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index 14e5fb47f..3b4adada8 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -11,7 +11,6 @@
 from xdsl.dialects.builtin import (
     MemRefType,
     TensorType,
-    Sequence,
     f32,
     f64,
     i64,
@@ -44,7 +43,7 @@ def __init__(
         args: tuple[Any, ...],
         attrs: dict[str, Any] = {},
         name: str | None = None,
-        op_type: MemRefType | TensorType = MemRefType,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
         self.operator = operator(args, attrs, name=name, op_type=op_type)
         self.args = args
@@ -81,7 +80,12 @@ def np_outputs_spec(self) -> list[dict[str, Any]]:
         return outputs_spec
 
     @classmethod
-    def from_operation(cls, xtc_op: Operation, name: str | None, op_type: MemRefType | TensorType) -> "MlirOperation":
+    def from_operation(
+        cls,
+        xtc_op: Operation,
+        name: str | None,
+        op_type: Type[MemRefType] | Type[TensorType],
+    ) -> "MlirOperation":
         dims = xtc_op.dims.values()
         dtype = xtc_op.inputs_types[0].dtype  # TODO: currently get dtype from 1st arg
         args = tuple([*dims, dtype])
@@ -101,7 +105,11 @@ class MlirOperator(ABC):
     KINDS = ""
 
     def __init__(
-            self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None, op_type: MemRefType | TensorType = MemRefType
+        self,
+        args: tuple[Any, ...],
+        attrs: dict[str, Any],
+        name: str | None = None,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
         self.args = args
         self.attrs = {**attrs}
@@ -154,12 +162,15 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
+                *(self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]]),
+                MemRefType(elt_type, [Ki, Kj]),  # output is always a memref
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 3
-        assert all(isinstance(arg.type, self.op_type) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args[:-1])
+        # output arg is always a memref (for now)
+        assert isinstance(args[-1].type, MemRefType)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
 
@@ -175,9 +186,10 @@ def generate_op(
                     outputs=(args[2],),
                 )
             else:
+                out_tensor_type = TensorType(elt_type, [Ki, Kj])
                 empty = tensor.EmptyOp(
                     dynamic_sizes=[],
-                    tensor_type=args[2].type,
+                    tensor_type=out_tensor_type,
                 )
                 fill = linalg.FillOp(
                     res=(empty.results[0].type,),
@@ -185,11 +197,10 @@ def generate_op(
                     outputs=(empty.results[0],),
                 )
                 reduce = linalg.MatmulOp(
-                    res=(args[2].type,),
+                    res=(fill.results[0].type,),
                     inputs=(args[0], args[1]),
                     outputs=(fill.results[0],),
                 )
-
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
         fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
@@ -198,7 +209,7 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 reduce_node_id: reduce,
-                "return_node_id": reduce if self.op_type == TensorType else None,
+                "return_node_id": reduce,
             },
             "dims_sizes": [
                 {"i": Ki, "j": Kj},
@@ -247,7 +258,11 @@ class MlirOperatorConv2D(MlirOperator):
     DEFAULT_STRIDE = (1, 1)
 
     def __init__(
-        self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None
+        self,
+        args: tuple[Any, ...],
+        attrs: dict[str, Any],
+        name: str | None = None,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,  # FIXME(review): not passed to super().__init__ below
     ) -> None:
         attrs = {"stride": self.DEFAULT_STRIDE, **attrs}
         super().__init__(args, attrs, name)
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
new file mode 100644
index 000000000..d363536ee
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
@@ -0,0 +1,120 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="matmul") as gb:
+    O.matmul(a, b, name="C")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+
+sch = impl.get_scheduler()
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="matmul_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>)
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0 to %c4 step %c1 {
+# CHECK-NEXT:       %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_3 to %c32 step %c1_4 {
+# CHECK-NEXT:         %subview_5 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_4 = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_5 to %c32 step %c1_6 {
+# CHECK-NEXT:         %subview_7 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %subview_3[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_9 = memref.subview %subview_4[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_10 to %c512 step %c1_11 {
+# CHECK-NEXT:           %subview_12 = memref.subview %subview_7[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_13 = memref.subview %subview_8[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_14 = memref.subview %subview_9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 4x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
+
diff --git a/tests/filecheck/backends/test_matmul_mlir_tensor.py b/tests/filecheck/backends/test_matmul_mlir_tensor.py
deleted file mode 100644
index 29d0392a8..000000000
--- a/tests/filecheck/backends/test_matmul_mlir_tensor.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# RUN: python %s 2>&1 | filecheck %s
-# UNSUPPORTED: mlir-target=nvgpu
-
-import xtc.graphs.xtc.op as O
-from xtc.backends.mlir import Backend
-
-I, J, K, dtype = 4, 32, 512, "float32"
-a = O.tensor((I, K), dtype, name="A")
-b = O.tensor((K, J), dtype, name="B")
-
-with O.graph(name="matmul") as gb:
-    O.matmul(a, b, name="C")
-
-graph = gb.graph
-print(graph)
-
-impl = Backend(graph)
-
-sch = impl.get_scheduler()
-sched = sch.schedule()
-
-comp = impl.get_compiler(
-    shared_lib=True,
-    dump_file="matmul_mlir",
-    print_source_ir=True,
-    print_transformed_ir=True,
-)
-module = comp.compile(sched)
-executor = module.get_executor(validate=True)
-res = executor.execute()
-print(f"CODE: {res}")

From 6dbe48eaab51f12500769defec838514ec594815 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Fri, 6 Feb 2026 14:27:22 +0100
Subject: [PATCH 04/14] tensor-dialect: support for multiple ops

---
 src/xtc/backends/mlir/MlirGraphBackend.py     |  19 +-
 src/xtc/backends/mlir/MlirOps.py              |  23 ++-
 .../test_two_matmuls_mlir_tensor.py           | 181 ++++++++++++++++++
 3 files changed, 207 insertions(+), 16 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index 3f88cc793..d9aa9ed6c 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -6,7 +6,7 @@
 from typing_extensions import override
 
 from xdsl.dialects.func import FuncOp as xdslFuncOp
-from xdsl.dialects import func, memref, bufferization
+from xdsl.dialects import func, memref, tensor, bufferization
 from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr
 from xdsl.ir import Region, Block, Operation
 from xdsl.builder import ImplicitBuilder
@@ -79,12 +79,19 @@ def _xdsl_generate_node(
                 continue
             with ImplicitBuilder(block):
                 elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
-                alloca = memref.AllocaOp.get(
-                    return_type=elt_type,
-                    shape=shape,
-                    alignment=256,  # Take the default of dlpack lib
+                result_op = (
+                    tensor.EmptyOp(
+                        dynamic_sizes=[],
+                        tensor_type=self._xdsl_type_from_tensortype(type),
+                    )
+                    if self.xdsl_type == TensorType
+                    else memref.AllocaOp.get(
+                        return_type=elt_type,
+                        shape=shape,
+                        alignment=256,  # Take the default of dlpack lib
+                    )
                 )
-            variables[name] = alloca.results[0]
+            variables[name] = result_op.results[0]
         args = [variables[name] for name in names]
         _, attrs = operation.generate(block=block, args=args)
         last_node = attrs["nodes_map"].get("return_node_id")
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index 3b4adada8..088c4195e 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -162,15 +162,15 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                self.op_type(elt_type, shape)
-                for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
+                self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]]
             ]
+            ops_types.append(MemRefType(elt_type, [Ki, Kj]))
             block = Block(arg_types=ops_types)
             args = block.args
+        has_tensor_result = isinstance(args[-1].type, TensorType)
         assert len(args) == 3
         assert all(isinstance(arg.type, self.op_type) for arg in args[:-1])
-        # output arg is always a memref (for now)
-        assert isinstance(args[-1].type, MemRefType)
+        assert not (has_tensor_result and self.op_type == MemRefType)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
 
@@ -186,15 +186,18 @@ def generate_op(
                     outputs=(args[2],),
                 )
             else:
-                out_tensor_type = TensorType(elt_type, [Ki, Kj])
-                empty = tensor.EmptyOp(
-                    dynamic_sizes=[],
-                    tensor_type=out_tensor_type,
+                empty = (
+                    args[2]
+                    if has_tensor_result
+                    else tensor.EmptyOp(
+                        dynamic_sizes=[],
+                        tensor_type=TensorType(elt_type, [Ki, Kj]),
+                    ).results[0]
                 )
                 fill = linalg.FillOp(
-                    res=(empty.results[0].type,),
+                    res=(empty.type,),
                     inputs=(cst0.results[0],),
-                    outputs=(empty.results[0],),
+                    outputs=(empty,),
                 )
                 reduce = linalg.MatmulOp(
                     res=(fill.results[0].type,),
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
new file mode 100644
index 000000000..a7ea1b96a
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -0,0 +1,181 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+c = O.tensor((J, I), dtype, name="C")
+
+with O.graph(name="matmul") as gb:
+    d = O.matmul(a, b, name="D")
+    O.matmul(c, d, name="E")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+#impl = Backend(graph, use_tensor_dialect=False)
+
+sch = impl.get_scheduler(default_node = "E")
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="two_matmul_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_D_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_D_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_E_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./j" : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_E_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./k" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg4 = %c0 to %c4 step %c1 {
+# CHECK-NEXT:       %subview = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 {
+# CHECK-NEXT:         %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_10 = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 {
+# CHECK-NEXT:         %subview_14 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg6 = %c0_17 to %c512 step %c1_18 {
+# CHECK-NEXT:           %subview_19 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_20 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_21 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_19, %subview_20 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg4 = %c0_4 to %c32 step %c1_5 {
+# CHECK-NEXT:       %subview = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 {
+# CHECK-NEXT:         %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 {
+# CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:       %subview_9 = memref.subview %alloc[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 {
+# CHECK-NEXT:         %subview_14 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, %arg5] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg6 = %c0_17 to %c4_18 step %c1_19 {
+# CHECK-NEXT:           %subview_20 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_22 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_20, %subview_21 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_22 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   - %2 : 32x4xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %4 : 32x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %3: matmul(%0, %1) {name = 'D'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:   - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
+

From 89dd2f54d374919ddcf9bd385aed9b8b2481eb8f Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Fri, 6 Feb 2026 17:16:06 +0100
Subject: [PATCH 05/14] tensor-dialect: cleanup, added lowering check to tests

---
 src/xtc/backends/mlir/MlirCompiler.py         |  4 +-
 src/xtc/backends/mlir/MlirGraphBackend.py     | 19 ++++----
 src/xtc/backends/mlir/MlirOps.py              | 48 ++++++-------------
 .../tensor_dialect/test_matmul_mlir_tensor.py | 26 +++++++++-
 .../test_two_matmuls_mlir_tensor.py           | 34 ++++++++++++-
 5 files changed, 87 insertions(+), 44 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index 9d3e55354..babec3e33 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -154,6 +154,8 @@ def mlir_apply_tensor_lowering_pass(self) -> None:
         apply_transform_pass = MlirProgramApplyPasses(
             mlir_program=self._mlir_program,
         )
+        if self._config.print_bufferization_ir:
+            self.dump_ir("IR Dump Before Tensor Lowering")
         apply_transform_pass.run(
             [
                 "eliminate-empty-tensors",  # causes ops to write directly to out buffer
@@ -161,7 +163,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None:
             ]
         )
         if self._config.print_bufferization_ir:
-            self.dump_ir("IR Dump After Lowering")
+            self.dump_ir("IR Dump After Tensor Lowering")
 
     def _save_temp(self, fname: str, content: Any) -> None:
         if not self._config.save_temps:
diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index d9aa9ed6c..e6236ce53 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -75,6 +75,12 @@ def _xdsl_generate_node(
         assert node.inputs_types is not None and node.outputs_types is not None
         types = [*node.inputs_types, *node.outputs_types]
         for name, type in zip(names, types):
+            if name in node.outputs and self.xdsl_type == TensorType:
+                with ImplicitBuilder(block):
+                    variables[name] = tensor.EmptyOp(
+                        dynamic_sizes=[],
+                        tensor_type=self._xdsl_type_from_tensortype(type),
+                    ).results[0]
             if name in variables:
                 continue
             with ImplicitBuilder(block):
@@ -82,7 +88,7 @@ def _xdsl_generate_node(
                 result_op = (
                     tensor.EmptyOp(
                         dynamic_sizes=[],
-                        tensor_type=self._xdsl_type_from_tensortype(type),
+                        tensor_type=TensorType(elt_type, shape),
                     )
                     if self.xdsl_type == TensorType
                     else memref.AllocaOp.get(
@@ -111,7 +117,7 @@ def _init_from_graph(
         )
         params_types = [
             self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type))
-            for tensor_type in inputs_types  # [*inputs_types, *outputs_types]
+            for tensor_type in inputs_types
         ]
         # graph output types are always memrefs
         params_types.extend(
@@ -133,15 +139,12 @@ def _init_from_graph(
         with ImplicitBuilder(inlined_block):
             if self.xdsl_type == TensorType:
                 assert last_node
-                reduce = bufferization.MaterializeInDestinationOp(
-                    # operands=((inlined_block.args[-1],), (last_node.results[0],)),
+                # write the final tensor value to the output buffer
+                dest = bufferization.MaterializeInDestinationOp(
                     operands=((last_node.results[0],), (inlined_block.args[-1],)),
-                    # result_types=(last_node.results[0].type,),
-                    # result_types=(inlined_block.args[-1].type,),
                     result_types=((),),
+                    attributes={"writable": UnitAttr(), "restrict": UnitAttr()},
                 )
-                reduce.attributes["writable"] = UnitAttr()
-                reduce.attributes["restrict"] = UnitAttr()
             func.ReturnOp()
         region = Region([inlined_block])  # type: ignore # issue with mypy
         payload = xdslFuncOp.from_region(
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index 088c4195e..c645f0f41 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -7,7 +7,7 @@
 from typing_extensions import override
 from typing import Any, Type, TypeAlias, cast
 
-from xdsl.dialects import linalg, arith, builtin, memref, tensor
+from xdsl.dialects import linalg, arith, builtin, memref
 from xdsl.dialects.builtin import (
     MemRefType,
     TensorType,
@@ -164,7 +164,7 @@ def generate_op(
             ops_types = [
                 self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]]
             ]
-            ops_types.append(MemRefType(elt_type, [Ki, Kj]))
+            ops_types.append(TensorType(elt_type, [Ki, Kj]))
             block = Block(arg_types=ops_types)
             args = block.args
         has_tensor_result = isinstance(args[-1].type, TensorType)
@@ -173,37 +173,19 @@ def generate_op(
         assert not (has_tensor_result and self.op_type == MemRefType)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
-
-            if self.op_type == MemRefType:
-                fill = linalg.FillOp(
-                    res=(),
-                    inputs=(cst0.results[0],),
-                    outputs=(args[2],),
-                )
-                reduce = linalg.MatmulOp(
-                    res=(),
-                    inputs=(args[0], args[1]),
-                    outputs=(args[2],),
-                )
-            else:
-                empty = (
-                    args[2]
-                    if has_tensor_result
-                    else tensor.EmptyOp(
-                        dynamic_sizes=[],
-                        tensor_type=TensorType(elt_type, [Ki, Kj]),
-                    ).results[0]
-                )
-                fill = linalg.FillOp(
-                    res=(empty.type,),
-                    inputs=(cst0.results[0],),
-                    outputs=(empty,),
-                )
-                reduce = linalg.MatmulOp(
-                    res=(fill.results[0].type,),
-                    inputs=(args[0], args[1]),
-                    outputs=(fill.results[0],),
-                )
+            result = (args[2].type,) if self.op_type == TensorType else ()
+            fill = linalg.FillOp(
+                res=result,
+                inputs=(cst0.results[0],),
+                outputs=(args[2],),
+            )
+            reduce = linalg.MatmulOp(
+                res=result,
+                inputs=(args[0], args[1]),
+                outputs=(fill.results[0],)
+                if self.op_type == TensorType
+                else (args[2],),
+            )
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
         fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
index d363536ee..5a7ed668e 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
@@ -24,12 +24,36 @@
     dump_file="matmul_mlir_tensor",
     print_source_ir=True,
     print_transformed_ir=True,
+    print_bufferization_ir=True,
 )
 module = comp.compile(sched)
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>)
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
index a7ea1b96a..7846515e9 100644
--- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -27,12 +27,44 @@
     dump_file="two_matmul_mlir_tensor",
     print_source_ir=True,
     print_transformed_ir=True,
+    print_bufferization_ir=True,
 )
 module = comp.compile(sched)
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32>
+# CHECK-NEXT:     %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %0 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>

From f5a8c01d1587afd0ef03a93a16a14e66aebf2721 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Mon, 9 Feb 2026 15:58:29 +0100
Subject: [PATCH 06/14] tensor-dialect: added relu with collapsing shapes

---
 src/xtc/backends/mlir/MlirCompiler.py         |   1 +
 src/xtc/backends/mlir/MlirOps.py              |  57 +++--
 .../test_matmul_relu_mlir_tensor.py           | 204 ++++++++++++++++++
 .../test_two_matmuls_mlir_tensor.py           |  24 +--
 4 files changed, 260 insertions(+), 26 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index babec3e33..85d7d543b 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -160,6 +160,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None:
             [
                 "eliminate-empty-tensors",  # causes ops to write directly to out buffer
                 "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}",
+                "func.func(promote-buffers-to-stack)",
             ]
         )
         if self._config.print_bufferization_ir:
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index c645f0f41..e919d713b 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -7,7 +7,7 @@
 from typing_extensions import override
 from typing import Any, Type, TypeAlias, cast
 
-from xdsl.dialects import linalg, arith, builtin, memref
+from xdsl.dialects import linalg, arith, builtin, memref, tensor
 from xdsl.dialects.builtin import (
     MemRefType,
     TensorType,
@@ -391,13 +391,14 @@ def generate_op(
         elt_type = {"float32": f32, "float64": f64}[dtype]
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
-            ops_types = [MemRefType(elt_type, shape) for shape in [[Ki], [Ki]]]
+            ops_types = [self.op_type(elt_type, shape) for shape in [[Ki], [Ki]]]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         inp_shape, out_shape = [
-            list(cast(MemRefType, arg.type).get_shape()) for arg in args
+            list(cast(self.op_type, arg.type).get_shape())  # type: ignore
+            for arg in args
         ]
         inp_size, out_size = [mulall(shape) for shape in [inp_shape, out_shape]]
         assert inp_size == out_size
@@ -416,15 +417,32 @@ def generate_op(
                     )
                 ]
             )
-            inp = memref.CollapseShapeOp(
-                operands=[args[0]],
-                properties=dict(reassociation=inp_reassociation),
-                result_types=[MemRefType(elt_type, (inp_size,))],
-            )
-            out = memref.CollapseShapeOp(
-                operands=[args[1]],
-                properties=dict(reassociation=out_reassociation),
-                result_types=[MemRefType(elt_type, (out_size,))],
+            if self.op_type == TensorType:
+                inp = tensor.CollapseShapeOp(  # type: ignore
+                    operands=[args[0]],
+                    properties=dict(reassociation=inp_reassociation),
+                    result_types=[self.op_type(elt_type, (inp_size,))],
+                )
+                # create empty tensor for collapsed output shape
+                out_empty = tensor.EmptyOp([], TensorType(elt_type, [out_size]))
+                out_operand = out_empty.tensor
+            else:
+                inp = memref.CollapseShapeOp(  # type: ignore
+                    operands=[args[0]],
+                    properties=dict(reassociation=inp_reassociation),
+                    result_types=[self.op_type(elt_type, (inp_size,))],
+                )
+                out = memref.CollapseShapeOp(
+                    operands=[args[1]],
+                    properties=dict(reassociation=out_reassociation),
+                    result_types=[self.op_type(elt_type, (out_size,))],
+                )
+                out_operand = out.results[0]
+
+            result = (
+                (TensorType(elt_type, [out_size]),)
+                if self.op_type == TensorType
+                else ()
             )
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
             iterator_types = [
@@ -436,7 +454,7 @@ def generate_op(
                 linalg.YieldOp(max)
             relu = linalg.GenericOp(
                 inputs=(inp.results[0], cst0.results[0]),
-                outputs=(out.results[0],),
+                outputs=(out_operand,),
                 body=Region([block_in]),  # type: ignore # mypy issue with dataclass
                 # ignore typing due to xdsl hints limitation
                 indexing_maps=[
@@ -460,12 +478,23 @@ def generate_op(
                     ),
                 ],
                 iterator_types=iterator_types,
+                result_types=result,
             )
+            relu_result = None
+            if self.op_type == TensorType:
+                relu_result = tensor.ExpandShapeOp(
+                    relu.results[0],
+                    reassociation=out_reassociation,
+                    result_type=TensorType(elt_type, out_shape),
+                    static_output_shape=out_shape,
+                    dynamic_output_shape=[],
+                )
         relu_node_id = f"{self.name}"
         relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
                 relu_node_id: relu,
+                "return_node_id": relu_result,
             },
             "dims_sizes": [
                 self.dims_sizes(),
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
new file mode 100644
index 000000000..76379eb5c
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
@@ -0,0 +1,204 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="matmul_relu") as gb:
+    m = O.matmul(a, b, name="matmul")
+    O.relu(m, name="relu")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+
+sch = impl.get_scheduler(default_node="matmul")
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="matmul_relu_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %6 = arith.maximumf %in, %in_1 : f32
+# CHECK-NEXT:       linalg.yield %6 : f32
+# CHECK-NEXT:     } -> tensor<128xf32>
+# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.maximumf %in, %in_2 : f32
+# CHECK-NEXT:       linalg.yield %0 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
+# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.maximumf %in, %in_2 : f32
+# CHECK-NEXT:       linalg.yield %0 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
+# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_matmul_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./i" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0 to %c4 step %c1 {
+# CHECK-NEXT:       %subview = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_7 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_7 to %c32 step %c1_8 {
+# CHECK-NEXT:         %subview_9 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_7 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_8 = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c32 step %c1_10 {
+# CHECK-NEXT:         %subview_11 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_12 = memref.subview %subview_7[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_13 = memref.subview %subview_8[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_14 to %c512 step %c1_15 {
+# CHECK-NEXT:           %subview_16 = memref.subview %subview_11[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_17 = memref.subview %subview_12[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_18 = memref.subview %subview_13[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_} ins(%subview_16, %subview_17 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_18 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c128 = arith.constant 128 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_5 to %c128 step %c1_6 {
+# CHECK-NEXT:       %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
+# CHECK-NEXT:       %subview_7 = memref.subview %alloca_3[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
+# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_4 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_7 : memref<1xf32, strided<[1], offset: ?>>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_8: f32, %out: f32):
+# CHECK-NEXT:         %0 = arith.maximumf %in, %in_8 : f32
+# CHECK-NEXT:         linalg.yield %0 : f32
+# CHECK-NEXT:       }
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
+# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul_relu
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %3 : 4x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: matmul(%0, %1) {name = 'matmul'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:   - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
+
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
index 7846515e9..efb890a66 100644
--- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -52,13 +52,13 @@
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
 # CHECK-NEXT: module {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
 # CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
@@ -67,13 +67,13 @@
 # CHECK-NEXT: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloc : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloc : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
 # CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
@@ -113,13 +113,13 @@
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     scf.for %arg4 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:       %subview = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %c0_9 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_10 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_11 = arith.constant 1 : index
@@ -134,7 +134,7 @@
 # CHECK-NEXT:     scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 {
 # CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %alloc[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_10 = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_12 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_13 = arith.constant 1 : index
@@ -172,7 +172,7 @@
 # CHECK-NEXT:     %c1_8 = arith.constant 1 : index
 # CHECK-NEXT:     scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 {
 # CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %alloc[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_9 = memref.subview %alloca[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
 # CHECK-NEXT:       %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_12 = arith.constant 32 : index

From cbb5303bee8044b19c2bd4e2ba2595ea5323ad73 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Tue, 10 Feb 2026 11:44:42 +0100
Subject: [PATCH 07/14] tensor-dialect: added conv2d

---
 src/xtc/backends/mlir/MlirOps.py              |  23 +-
 .../test_conv2d_mini_mlir_tensor.py           | 233 ++++++++++++++++++
 2 files changed, 246 insertions(+), 10 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index e919d713b..e309149fe 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -162,15 +162,13 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                self.op_type(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj]]
+                self.op_type(elt_type, shape)
+                for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
             ]
-            ops_types.append(TensorType(elt_type, [Ki, Kj]))
             block = Block(arg_types=ops_types)
             args = block.args
-        has_tensor_result = isinstance(args[-1].type, TensorType)
         assert len(args) == 3
-        assert all(isinstance(arg.type, self.op_type) for arg in args[:-1])
-        assert not (has_tensor_result and self.op_type == MemRefType)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
             result = (args[2].type,) if self.op_type == TensorType else ()
@@ -250,7 +248,7 @@ def __init__(
         op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
         attrs = {"stride": self.DEFAULT_STRIDE, **attrs}
-        super().__init__(args, attrs, name)
+        super().__init__(args, attrs, name, op_type)
 
     @override
     def dims(self, kind: str = "") -> tuple[str, ...]:
@@ -274,16 +272,17 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape) for shape in [*inps_dims, out_dims]
+                self.op_type(elt_type, shape) for shape in [*inps_dims, out_dims]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 3
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         with ImplicitBuilder(block):
+            result = (args[2].type,) if self.op_type == TensorType else ()
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
             fill = linalg.FillOp(
-                res=(),
+                res=result,
                 inputs=(cst0.results[0],),
                 outputs=(args[2],),
             )
@@ -306,7 +305,9 @@ def generate_op(
                 linalg.YieldOp(add)
             reduce = linalg.GenericOp(
                 inputs=(args[0], args[1]),
-                outputs=(args[2],),
+                outputs=(fill.results[0],)
+                if self.op_type == TensorType
+                else (args[2],),
                 body=Region([block_in]),  # type: ignore # mypy issue with dataclass
                 # ignore typing due to xdsl hints limitation
                 indexing_maps=[
@@ -330,6 +331,7 @@ def generate_op(
                     ),
                 ],
                 iterator_types=iterator_types,
+                result_types=result,
             )
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
@@ -339,6 +341,7 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 reduce_node_id: reduce,
+                "return_node_id": reduce,
             },
             "dims_sizes": [
                 {"b": Kb, "h": Kh, "w": Kw, "f": Kf},
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
new file mode 100644
index 000000000..89e6a0e18
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
@@ -0,0 +1,233 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+# Small conv2d
+N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 3, 3, 3, 1, 1, "float32"
+a = O.tensor((N, H + R - 1, W + S - 1, C), dtype, name="I")
+b = O.tensor((R, S, C, F), dtype, name="W")
+
+with O.graph(name="conv2d_nhwc_mini") as gb:
+    O.conv2d(a, b, stride=(SH, SW), name="O")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+
+sch = impl.get_scheduler()
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="conv2d_nhwc_mini_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x8x8x16xf32>) -> tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x10x10x3xf32>, tensor<3x3x3x16xf32>) outs(%1 : tensor<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %3 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %4 = arith.addf %out, %3 : f32
+# CHECK-NEXT:       linalg.yield %4 : f32
+# CHECK-NEXT:     } -> tensor<1x8x8x16xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>)
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
+# CHECK-NEXT:       linalg.yield %1 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>)
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
+# CHECK-NEXT:       linalg.yield %1 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./f" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./c" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0 to %c1 step %c1_0 {
+# CHECK-NEXT:       %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_4 to %c8 step %c1_5 {
+# CHECK-NEXT:         %subview_6 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_7 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_8 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_7 to %c8_8 step %c1_9 {
+# CHECK-NEXT:           %subview_10 = memref.subview %subview_6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:           scf.for %arg6 = %c0_11 to %c16 step %c1_12 {
+# CHECK-NEXT:             %subview_13 = memref.subview %subview_10[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_6 to %c8 step %c1_7 {
+# CHECK-NEXT:         %subview_8 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_9 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:         %subview_10 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
+# CHECK-NEXT:           %subview_14 = memref.subview %subview_8[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:           %subview_16 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:           scf.for %arg6 = %c0_17 to %c16 step %c1_18 {
+# CHECK-NEXT:             %subview_19 = memref.subview %subview_14[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_20 = memref.subview %subview_15[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %subview_21 = memref.subview %subview_16[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:             %c3 = arith.constant 3 : index
+# CHECK-NEXT:             %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:             scf.for %arg7 = %c0_22 to %c3 step %c1_23 {
+# CHECK-NEXT:               %subview_24 = memref.subview %subview_19[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_25 = memref.subview %subview_20[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %subview_26 = memref.subview %subview_21[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               %c0_27 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_28 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:               scf.for %arg8 = %c0_27 to %c3_28 step %c1_29 {
+# CHECK-NEXT:                 %subview_30 = memref.subview %subview_24[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_31 = memref.subview %subview_25[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_32 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %c0_33 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_34 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_35 = arith.constant 1 : index
+# CHECK-NEXT:                 scf.for %arg9 = %c0_33 to %c3_34 step %c1_35 {
+# CHECK-NEXT:                   %subview_36 = memref.subview %subview_30[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_37 = memref.subview %subview_31[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_38 = memref.subview %subview_32[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_36, %subview_37 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_39: f32, %out: f32):
+# CHECK-NEXT:                     %0 = arith.mulf %in, %in_39 : f32
+# CHECK-NEXT:                     %1 = arith.addf %out, %0 : f32
+# CHECK-NEXT:                     linalg.yield %1 : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: conv2d_nhwc_mini
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x10x10x3xfloat32
+# CHECK-NEXT:   - %1 : 3x3x3x16xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 1x8x8x16xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
+

From 3759374b80c84615942836aa56a5b0d03449deff Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Tue, 10 Feb 2026 16:25:18 +0100
Subject: [PATCH 08/14] tensor-dialect: pad, unpad, memory alignment change

---
 src/xtc/backends/mlir/MlirCompiler.py         |   2 +-
 src/xtc/backends/mlir/MlirOps.py              |  76 +++--
 .../test_matmul_relu_mlir_tensor.py           |  13 +-
 .../test_pad_conv2d_mlir_tensor.py            | 298 ++++++++++++++++++
 .../test_pad_matmul_unpad_mlir_tensor.py      | 242 ++++++++++++++
 .../test_two_matmuls_mlir_tensor.py           |   8 +-
 6 files changed, 598 insertions(+), 41 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index 85d7d543b..7535f31a5 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -159,7 +159,7 @@ def mlir_apply_tensor_lowering_pass(self) -> None:
         apply_transform_pass.run(
             [
                 "eliminate-empty-tensors",  # causes ops to write directly to out buffer
-                "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map}",
+                "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map buffer-alignment=256}",
                 "func.func(promote-buffers-to-stack)",
             ]
         )
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index e309149fe..1ad47bc91 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -566,7 +566,7 @@ def generate_op(
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         if isinstance(padding, dict):
             offsets = [0 for _ in self.args[:-1]]
             for i, (pad_b, pad_a) in padding.items():
@@ -577,23 +577,33 @@ def generate_op(
         strides = [1 for _ in self.args[:-1]]
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size))
+            result = (args[1].type,) if self.op_type == TensorType else ()
             fill = linalg.FillOp(
-                res=(),
+                res=result,
                 inputs=(cst0.results[0],),
                 outputs=(args[1],),
             )
-            subview = memref.SubviewOp.from_static_parameters(
-                source=args[1],
-                source_type=args[1].type,  # type: ignore
-                offsets=offsets,
-                sizes=sizes,
-                strides=strides,
-            )
-            copy = linalg.CopyOp(
-                inputs=[args[0]],
-                outputs=[subview.result],
-                res=(),
-            )
+            if self.op_type == TensorType:
+                copy = tensor.InsertSliceOp.from_static_parameters(
+                    source=args[0],
+                    dest=fill.results[0],
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+            else:
+                subview = memref.SubviewOp.from_static_parameters(
+                    source=args[1],
+                    source_type=args[1].type,  # type: ignore
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+                copy = linalg.CopyOp(  # type: ignore
+                    inputs=[args[0]],
+                    outputs=[subview.result],
+                    res=result,
+                )
         fill_node_id = f"{self.name}_0"
         fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
         copy_node_id = f"{self.name}"
@@ -602,6 +612,7 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 copy_node_id: copy,
+                "return_node_id": copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
@@ -674,13 +685,13 @@ def generate_op(
         elt_type = {"float32": f32, "float64": f64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape)
+                self.op_type(elt_type, shape)
                 for shape in [dims_values_before_unpad, dims_values]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         if isinstance(padding, dict):
             offsets = [0 for _ in self.args[:-1]]
             for i, (pad_b, _) in padding.items():
@@ -690,23 +701,32 @@ def generate_op(
         sizes = dims_values
         strides = [1 for _ in self.args[:-1]]
         with ImplicitBuilder(block):
-            subview = memref.SubviewOp.from_static_parameters(
-                source=args[0],
-                source_type=args[0].type,  # type: ignore
-                offsets=offsets,
-                sizes=sizes,
-                strides=strides,
-            )
-            copy = linalg.CopyOp(
-                inputs=[subview.result],
-                outputs=[args[1]],
-                res=(),
-            )
+            if self.op_type == TensorType:
+                copy = tensor.ExtractSliceOp.from_static_parameters(
+                    source=args[0],
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+            else:
+                subview = memref.SubviewOp.from_static_parameters(
+                    source=args[0],
+                    source_type=args[0].type,  # type: ignore
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+                copy = linalg.CopyOp(  # type: ignore
+                    inputs=[subview.result],
+                    outputs=[args[1]],
+                    res=(),
+                )
         copy_node_id = f"{self.name}"
         copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
                 copy_node_id: copy,
+                "return_node_id": copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
index 76379eb5c..037bc5f53 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
@@ -60,12 +60,12 @@
 # CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
 # CHECK-NEXT: module {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
 # CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
@@ -83,12 +83,12 @@
 # CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
 # CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
@@ -128,7 +128,7 @@
 # CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
@@ -169,7 +169,7 @@
 # CHECK-NEXT:       } {"./j"}
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 64 : i64} : memref<128xf32>
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
 # CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_5 = arith.constant 0 : index
 # CHECK-NEXT:     %c128 = arith.constant 128 : index
@@ -201,4 +201,3 @@
 # CHECK-NEXT:   - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32]
 # CHECK-NEXT:  
 # CHECK-NEXT: CODE: 0
-
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
new file mode 100644
index 000000000..68c2c7617
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
@@ -0,0 +1,298 @@
+# RUN: python %s 2>&1 | filecheck %s
+# REQUIRES: module_mlir
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+# Small NHWC pad2d + conv2d: 1x8x8x3 input, 5x5x3x16 weights, stride 2x2.
+N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 5, 5, 3, 2, 2, "float32"
+a = O.tensor((N, H, W, C), dtype, name="I")  # input, dims ordered (N, H, W, C)
+b = O.tensor((R, S, C, F), dtype, name="W")  # weights, dims ordered (R, S, C, F)
+
+with O.graph(name="pad_conv2d_nhwc_mini") as gb:
+    p = O.pad2d(a, padding=2, axes=(1, 2), name="pad")  # pads axes 1 and 2 by (2, 2): 8x8 -> 12x12
+    O.conv2d(p, b, stride=(SH, SW), name="conv")  # reads the padded tensor; result is 1x4x4x16
+
+graph = gb.graph
+print(graph)  # NOTE(review): matched after the IR dumps below; presumably an ordering effect of 2>&1 - confirm
+
+impl = Backend(graph, use_tensor_dialect=True)  # exercise the tensor-dialect path of the MLIR backend
+
+sch = impl.get_scheduler()
+sched = sch.schedule()  # schedule is taken as-is; no directives are applied to sch beforehand
+
+comp = impl.get_compiler(  # the print_* flags presumably request the IR dumps matched below - confirm
+    shared_lib=True,
+    dump_file="pad_conv2d_nhwc_mini_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)  # presumably validates results against a reference - TODO confirm
+res = executor.execute()
+print(f"CODE: {res}")  # the expected output below ends with "CODE: 0"
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%0 : tensor<1x12x12x3xf32>) -> tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%0, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %5 = arith.mulf %in, %in_1 : f32
+# CHECK-NEXT:       %6 = arith.addf %out, %5 : f32
+# CHECK-NEXT:       linalg.yield %6 : f32
+# CHECK-NEXT:     } -> tensor<1x4x4x16xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>)
+# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.mulf %in, %in_1 : f32
+# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
+# CHECK-NEXT:       linalg.yield %1 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>)
+# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
+# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %0 = arith.mulf %in, %in_1 : f32
+# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
+# CHECK-NEXT:       linalg.yield %1 : f32
+# CHECK-NEXT:     }
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./c" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./c" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_21 "./f" : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_23 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_25 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_27 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_29 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_31 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_33 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0 to %c1 step %c1_0 {
+# CHECK-NEXT:       %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c12 = arith.constant 12 : index
+# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c12 step %c1_10 {
+# CHECK-NEXT:         %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_12 to %c12_13 step %c1_14 {
+# CHECK-NEXT:           %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:           %c3 = arith.constant 3 : index
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
+# CHECK-NEXT:             %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:           } {"./c"}
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
+# CHECK-NEXT:       %subview_8 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c4 step %c1_10 {
+# CHECK-NEXT:         %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_12 to %c4_13 step %c1_14 {
+# CHECK-NEXT:           %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           scf.for %arg6 = %c0_16 to %c16 step %c1_17 {
+# CHECK-NEXT:             %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
+# CHECK-NEXT:       %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:       %subview_10 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_11 to %c4 step %c1_12 {
+# CHECK-NEXT:         %0 = affine.apply #map(%arg4)
+# CHECK-NEXT:         %subview_13 = memref.subview %subview_8[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_14 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:         %subview_15 = memref.subview %subview_10[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_16 to %c4_17 step %c1_18 {
+# CHECK-NEXT:           %1 = affine.apply #map(%arg5)
+# CHECK-NEXT:           %subview_19 = memref.subview %subview_13[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           %subview_20 = memref.subview %subview_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:           scf.for %arg6 = %c0_22 to %c16 step %c1_23 {
+# CHECK-NEXT:             %subview_24 = memref.subview %subview_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_25 = memref.subview %subview_20[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %subview_26 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             %c0_27 = arith.constant 0 : index
+# CHECK-NEXT:             %c5 = arith.constant 5 : index
+# CHECK-NEXT:             %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:             scf.for %arg7 = %c0_27 to %c5 step %c1_28 {
+# CHECK-NEXT:               %subview_29 = memref.subview %subview_24[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_30 = memref.subview %subview_25[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %subview_31 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               %c0_32 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_33 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_34 = arith.constant 1 : index
+# CHECK-NEXT:               scf.for %arg8 = %c0_32 to %c5_33 step %c1_34 {
+# CHECK-NEXT:                 %subview_35 = memref.subview %subview_29[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_36 = memref.subview %subview_30[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_37 = memref.subview %subview_31[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %c0_38 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
+# CHECK-NEXT:                 scf.for %arg9 = %c0_38 to %c3 step %c1_39 {
+# CHECK-NEXT:                   %subview_40 = memref.subview %subview_35[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_41 = memref.subview %subview_36[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_42 = memref.subview %subview_37[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_41 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_42 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_43: f32, %out: f32):
+# CHECK-NEXT:                     %2 = arith.mulf %in, %in_43 : f32
+# CHECK-NEXT:                     %3 = arith.addf %out, %2 : f32
+# CHECK-NEXT:                     linalg.yield %3 : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: pad_conv2d_nhwc_mini
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x8x8x3xfloat32
+# CHECK-NEXT:   - %1 : 5x5x3x16xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %3 : 1x4x4x16xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: pad2d(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32]
+# CHECK-NEXT:   - %3: conv2d(%2, %1, stride=(2, 2)) {name = 'conv'} : [1x12x12x3xfloat32, 5x5x3x16xfloat32] -> [1x4x4x16xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
new file mode 100644
index 000000000..34c09d96e
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
@@ -0,0 +1,242 @@
+# RUN: python %s 2>&1 | filecheck %s
+# REQUIRES: module_mlir
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 14, 14, 14, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="pad_matmul_unpad") as gb:
+    p1 = O.pad(a, padding=(0, 2), name="A_pad")
+    p2 = O.pad(b, padding=(0, 2), name="B_pad")
+    m_pad = O.matmul(p1, p2, name="matmul_padded")
+    O.unpad(m_pad, padding=(0, 2), name="C")
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+sch = impl.get_scheduler(default_node="matmul_padded")
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="gen_pad_tuple_matmul_unpad_mlir",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%0 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %inserted_slice_1 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_2 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%0, %2 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
+# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
+# CHECK-NEXT:     %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>)
+# CHECK-NEXT:     %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
+# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
+# CHECK-NEXT:     %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>)
+# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>)
+# CHECK-NEXT:     %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./j" : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./j" : !transform.any_op
+# CHECK-NEXT:     %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./j" : !transform.any_op
+# CHECK-NEXT:     %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_21 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_23 "./k" : !transform.any_op
+# CHECK-NEXT:     %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_25 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0 to %c16 step %c1 {
+# CHECK-NEXT:       %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
+# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_3 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 {
+# CHECK-NEXT:       %subview_15 = memref.subview %alloca_0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
+# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %subview_5 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %cst_7 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 {
+# CHECK-NEXT:       %subview_15 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
+# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_12 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:     scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 {
+# CHECK-NEXT:       %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_16 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
+# CHECK-NEXT:       %subview_17 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_19 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:       scf.for %arg4 = %c0_18 to %c16_19 step %c1_20 {
+# CHECK-NEXT:         %subview_21 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_22 = memref.subview %subview_16[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_23 = memref.subview %subview_17[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_25 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_26 = arith.constant 1 : index
+# CHECK-NEXT:         scf.for %arg5 = %c0_24 to %c16_25 step %c1_26 {
+# CHECK-NEXT:           %subview_27 = memref.subview %subview_21[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_28 = memref.subview %subview_22[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_29 = memref.subview %subview_23[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %subview_14 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: pad_matmul_unpad
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 14x14xfloat32
+# CHECK-NEXT:   - %1 : 14x14xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %5 : 14x14xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: pad(%0, padding=(0, 2), constant_value=0) {name = 'A_pad'} : [14x14xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %3: pad(%1, padding=(0, 2), constant_value=0) {name = 'B_pad'} : [14x14xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %4: matmul(%2, %3) {name = 'matmul_padded'} : [16x16xfloat32, 16x16xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %5: unpad(%4, padding=(0, 2)) {name = 'C'} : [16x16xfloat32] -> [14x14xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
index efb890a66..c748923b8 100644
--- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -17,7 +17,6 @@
 print(graph)
 
 impl = Backend(graph, use_tensor_dialect=True)
-#impl = Backend(graph, use_tensor_dialect=False)
 
 sch = impl.get_scheduler(default_node = "E")
 sched = sch.schedule()
@@ -52,7 +51,7 @@
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
 # CHECK-NEXT: module {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
@@ -67,7 +66,7 @@
 # CHECK-NEXT: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
 # CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
@@ -113,7 +112,7 @@
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 64 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
@@ -210,4 +209,3 @@
 # CHECK-NEXT:   - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32]
 # CHECK-NEXT:  
 # CHECK-NEXT: CODE: 0
-

From 58ffe404a0591d0af452ee27bf3b651d68d81695 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Wed, 11 Feb 2026 13:50:17 +0100
Subject: [PATCH 09/14] tensor-dialect: add macOS buffer-alignment workaround,
 move bufferize passes to MlirCompilerPasses

---
 src/xtc/backends/mlir/MlirCompiler.py       | 15 ++++-----------
 src/xtc/backends/mlir/MlirCompilerPasses.py | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index 7535f31a5..6f3538e68 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -20,7 +20,7 @@
 from xtc.backends.mlir.MlirCompilerPasses import (
     MlirProgramInsertTransformPass,
     MlirProgramApplyTransformPass,
-    MlirProgramApplyPasses,
+    apply_bufferization_passes,
 )
 
 from xtc.backends.mlir.MlirTarget import (
@@ -151,18 +151,11 @@ def mlir_apply_transform_pass(self) -> None:
             self.dump_ir("IR Dump After transform")
 
     def mlir_apply_tensor_lowering_pass(self) -> None:
-        apply_transform_pass = MlirProgramApplyPasses(
-            mlir_program=self._mlir_program,
-        )
         if self._config.print_bufferization_ir:
             self.dump_ir("IR Dump Before Tensor Lowering")
-        apply_transform_pass.run(
-            [
-                "eliminate-empty-tensors",  # causes ops to write directly to out buffer
-                "one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map buffer-alignment=256}",
-                "func.func(promote-buffers-to-stack)",
-            ]
-        )
+
+        apply_bufferization_passes(self._mlir_program)
+
         if self._config.print_bufferization_ir:
             self.dump_ir("IR Dump After Tensor Lowering")
 
diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index 7a7e2da7e..dc228c157 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -26,6 +26,7 @@
     OpResult,
 )
 from mlir.passmanager import PassManager
+import platform
 
 # Import SDist if available
 try:
@@ -549,3 +550,22 @@ def run(self, pass_names: list[str]) -> None:
         for name in pass_names:
             pm.add(name)  # type: ignore # no attribute add
         pm.run(self._mlir_program.mlir_module.operation)
+
+
+def apply_bufferization_passes(mlir_program: RawMlirProgram):
+    apply_passes = MlirProgramApplyPasses(mlir_program)
+    bufferize_options = [
+        "bufferize-function-boundaries=1",
+        "function-boundary-type-conversion=identity-layout-map",
+        "buffer-alignment=256",
+    ]
+    # needed for now because macos mlir version needs to be updated
+    if platform.system() != "Darwin":
+        bufferize_options.append("buffer-alignment=256")
+    apply_passes.run(
+        [
+            "eliminate-empty-tensors",  # causes ops to write directly to out buffer
+            f"one-shot-bufferize{{{' '.join(bufferize_options)}}}",
+            "func.func(promote-buffers-to-stack)",
+        ]
+    )

From e2c88ce14e7abbb0dc01a18ac06ef07993832db5 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Fri, 13 Feb 2026 11:59:41 +0100
Subject: [PATCH 10/14] tensor-dialect: moved bufferization to after transform

---
 src/xtc/backends/mlir/MlirCompiler.py         |   4 +-
 src/xtc/backends/mlir/MlirCompilerPasses.py   |  11 +-
 src/xtc/backends/mlir/MlirGraphBackend.py     |  24 +-
 src/xtc/backends/mlir/MlirOps.py              |  18 +-
 .../test_conv2d_mini_mlir_tensor.py           | 433 ++++++++---
 .../test_conv2d_r181_mlir_tensor.py           | 723 ++++++++++++++++++
 .../tensor_dialect/test_matmul_mlir_tensor.py | 197 +++--
 .../test_matmul_relu_mlir_tensor.py           | 398 +++++++---
 .../test_pad_conv2d_mlir_tensor.py            | 609 +++++++++++----
 .../test_pad_matmul_unpad_mlir_tensor.py      | 379 ++++++---
 .../test_two_matmuls_mlir_tensor.py           | 361 +++++++--
 11 files changed, 2560 insertions(+), 597 deletions(-)
 create mode 100644 tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py

diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index 6f3538e68..8a761f87d 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -200,12 +200,12 @@ def compile(self) -> None:
 
         save_temp(src_ir_dump_file, self._mlir_program.mlir_module)
 
-        self.mlir_apply_tensor_lowering_pass()
-
         self.mlir_insert_transform_pass()
         save_temp(mlir_btrn_dump_file, self._mlir_program.mlir_module)
 
         self.mlir_apply_transform_pass()
         save_temp(mlir_atrn_dump_file, self._mlir_program.mlir_module)
 
+        self.mlir_apply_tensor_lowering_pass()
+
         self._target.generate_code_for_target(self._mlir_program, dump_file=dump_file)
diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index dc228c157..a52e6aa21 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -557,15 +557,22 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram):
     bufferize_options = [
         "bufferize-function-boundaries=1",
         "function-boundary-type-conversion=identity-layout-map",
-        "buffer-alignment=256",
     ]
-    # needed for now because macos mlir version needs to be updated
+    # TODO: below is needed until macos mlir is updated
     if platform.system() != "Darwin":
         bufferize_options.append("buffer-alignment=256")
     apply_passes.run(
         [
             "eliminate-empty-tensors",  # causes ops to write directly to out buffer
             f"one-shot-bufferize{{{' '.join(bufferize_options)}}}",
+            "func.func(buffer-hoisting)",
+            "func.func(buffer-loop-hoisting)",
+            "drop-equivalent-buffer-results",
             "func.func(promote-buffers-to-stack)",
         ]
     )
+
+
+def pre_transform_tensor_passes(mlir_program: RawMlirProgram):
+    apply_passes = MlirProgramApplyPasses(mlir_program)
+    # apply_passes.run(["eliminate-empty-tensors"])
diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index e6236ce53..0c05ebd16 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -83,24 +83,23 @@ def _xdsl_generate_node(
                     ).results[0]
             if name in variables:
                 continue
+            assert self.xdsl_type != TensorType
             with ImplicitBuilder(block):
                 elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
-                result_op = (
-                    tensor.EmptyOp(
-                        dynamic_sizes=[],
-                        tensor_type=TensorType(elt_type, shape),
-                    )
-                    if self.xdsl_type == TensorType
-                    else memref.AllocaOp.get(
-                        return_type=elt_type,
-                        shape=shape,
-                        alignment=256,  # Take the default of dlpack lib
-                    )
+                alloca = memref.AllocaOp.get(
+                    return_type=elt_type,
+                    shape=shape,
+                    alignment=256,  # Take the default of dlpack lib
                 )
-            variables[name] = result_op.results[0]
+            variables[name] = alloca.results[0]
         args = [variables[name] for name in names]
         _, attrs = operation.generate(block=block, args=args)
         last_node = attrs["nodes_map"].get("return_node_id")
+        # the tensor dialect needs the result of the op, not the alloca
+        if self.xdsl_type == TensorType:
+            # for name in node.outputs:
+            assert len(node.outputs) == 1
+            variables[node.outputs[0]] = last_node.results[0]
         return attrs, last_node
 
     def _init_from_graph(
@@ -131,6 +130,7 @@ def _init_from_graph(
         }
         block_attrs = []
         last_node = None
+
         for node in graph.nodes.values():
             node_attrs, last_node = self._xdsl_generate_node(
                 node, inlined_block, variables
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index 1ad47bc91..df8a4aba0 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -575,15 +575,16 @@ def generate_op(
             offsets = [padding[0] for _ in self.args[:-1]]
         sizes = list(dims_value_before_pad)
         strides = [1 for _ in self.args[:-1]]
+        using_tensors = self.op_type == TensorType
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size))
-            result = (args[1].type,) if self.op_type == TensorType else ()
+            result = (args[1].type,) if using_tensors else ()
             fill = linalg.FillOp(
                 res=result,
                 inputs=(cst0.results[0],),
                 outputs=(args[1],),
             )
-            if self.op_type == TensorType:
+            if using_tensors:
                 copy = tensor.InsertSliceOp.from_static_parameters(
                     source=args[0],
                     dest=fill.results[0],
@@ -611,12 +612,12 @@ def generate_op(
         attrs = {
             "nodes_map": {
                 fill_node_id: fill,
-                copy_node_id: copy,
+                copy_node_id: None if using_tensors else copy,
                 "return_node_id": copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
-                self.dims_sizes(),
+                *([] if using_tensors else [self.dims_sizes()]),
             ],
         }
         return block, attrs
@@ -700,8 +701,9 @@ def generate_op(
             offsets = [padding[0] for _ in self.args[:-1]]
         sizes = dims_values
         strides = [1 for _ in self.args[:-1]]
+        using_tensors = self.op_type == TensorType
         with ImplicitBuilder(block):
-            if self.op_type == TensorType:
+            if using_tensors:
                 copy = tensor.ExtractSliceOp.from_static_parameters(
                     source=args[0],
                     offsets=offsets,
@@ -725,12 +727,10 @@ def generate_op(
         copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
-                copy_node_id: copy,
+                copy_node_id: None if using_tensors else copy,
                 "return_node_id": copy,
             },
-            "dims_sizes": [
-                self.dims_sizes(),
-            ],
+            "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])],
         }
         return block, attrs
 
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
index 89e6a0e18..bd8db60bd 100644
--- a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
@@ -31,11 +31,12 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+
+# CHECK: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
 # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
 # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module {
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
@@ -49,44 +50,6 @@
 # CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>)
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.mulf %in, %in_0 : f32
-# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
-# CHECK-NEXT:       linalg.yield %1 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%arg2 : memref<1x8x8x16xf32>)
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : memref<1x10x10x3xf32>, memref<3x3x3x16xf32>) outs(%arg2 : memref<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.mulf %in, %in_0 : f32
-# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
-# CHECK-NEXT:       linalg.yield %1 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -125,96 +88,373 @@
 # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
 # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:       %subview = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
 # CHECK-NEXT:       %c0_4 = arith.constant 0 : index
 # CHECK-NEXT:       %c8 = arith.constant 8 : index
 # CHECK-NEXT:       %c1_5 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_4 to %c8 step %c1_5 {
-# CHECK-NEXT:         %subview_6 = memref.subview %subview[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
 # CHECK-NEXT:         %c0_7 = arith.constant 0 : index
 # CHECK-NEXT:         %c8_8 = arith.constant 8 : index
 # CHECK-NEXT:         %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_7 to %c8_8 step %c1_9 {
-# CHECK-NEXT:           %subview_10 = memref.subview %subview_6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_12 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_12 = arith.constant 1 : index
-# CHECK-NEXT:           scf.for %arg6 = %c0_11 to %c16 step %c1_12 {
-# CHECK-NEXT:             %subview_13 = memref.subview %subview_10[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32>
 # CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32>
 # CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32>
 # CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
 # CHECK-NEXT:     } {"./b"}
 # CHECK-NEXT:     %c0_1 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_2 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 {
-# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:       %subview_5 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:             %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:             %c3 = arith.constant 3 : index
+# CHECK-NEXT:             %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:             %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
+# CHECK-NEXT:               %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_31 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:               %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_38 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
+# CHECK-NEXT:                 %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_45: f32, %out: f32):
+# CHECK-NEXT:                     %10 = arith.mulf %in, %in_45 : f32
+# CHECK-NEXT:                     %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                     linalg.yield %11 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_7 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_8 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
 # CHECK-NEXT:       %c0_6 = arith.constant 0 : index
 # CHECK-NEXT:       %c8 = arith.constant 8 : index
 # CHECK-NEXT:       %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_6 to %c8 step %c1_7 {
-# CHECK-NEXT:         %subview_8 = memref.subview %subview[0, %arg4, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:         %subview_9 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:         %subview_10 = memref.subview %subview_5[0, %arg4, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
 # CHECK-NEXT:         %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:         %c8_12 = arith.constant 8 : index
 # CHECK-NEXT:         %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_11 to %c8_12 step %c1_13 {
-# CHECK-NEXT:           %subview_14 = memref.subview %subview_8[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:           %subview_16 = memref.subview %subview_10[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:           scf.for %arg6 = %c0_17 to %c16 step %c1_18 {
-# CHECK-NEXT:             %subview_19 = memref.subview %subview_14[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:             %subview_20 = memref.subview %subview_15[0, 0, 0, %arg6] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:             %subview_21 = memref.subview %subview_16[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:             %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:             %c3 = arith.constant 3 : index
-# CHECK-NEXT:             %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:             scf.for %arg7 = %c0_22 to %c3 step %c1_23 {
-# CHECK-NEXT:               %subview_24 = memref.subview %subview_19[0, %arg7, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:               %subview_25 = memref.subview %subview_20[%arg7, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:               %subview_26 = memref.subview %subview_21[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:               %c0_27 = arith.constant 0 : index
-# CHECK-NEXT:               %c3_28 = arith.constant 3 : index
-# CHECK-NEXT:               %c1_29 = arith.constant 1 : index
-# CHECK-NEXT:               scf.for %arg8 = %c0_27 to %c3_28 step %c1_29 {
-# CHECK-NEXT:                 %subview_30 = memref.subview %subview_24[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_31 = memref.subview %subview_25[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_32 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %c0_33 = arith.constant 0 : index
-# CHECK-NEXT:                 %c3_34 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_35 = arith.constant 1 : index
-# CHECK-NEXT:                 scf.for %arg9 = %c0_33 to %c3_34 step %c1_35 {
-# CHECK-NEXT:                   %subview_36 = memref.subview %subview_30[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_37 = memref.subview %subview_31[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_38 = memref.subview %subview_32[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_36, %subview_37 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_39: f32, %out: f32):
-# CHECK-NEXT:                     %0 = arith.mulf %in, %in_39 : f32
-# CHECK-NEXT:                     %1 = arith.addf %out, %0 : f32
-# CHECK-NEXT:                     linalg.yield %1 : f32
+# CHECK-NEXT:             %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:             %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
+# CHECK-NEXT:               %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_31 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:               %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_38 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
+# CHECK-NEXT:                 %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_45: f32, %out: f32):
+# CHECK-NEXT:                     %10 = arith.mulf %in, %in_45 : f32
+# CHECK-NEXT:                     %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                     linalg.yield %11 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %subview) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_9 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_8 to %c8_9 step %c1_10 iter_args(%arg8 = %subview_7) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_12 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:           %4 = scf.for %arg9 = %c0_13 to %c16 step %c1_14 iter_args(%arg10 = %subview_12) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_16 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
+# CHECK-NEXT:             %subview_17 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_16, %subview_17 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_15 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %4, %subview_15 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_11 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %subview_5) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_9 = memref.subview %subview[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_10 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_13 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_12 to %c8_13 step %c1_14 iter_args(%arg8 = %subview_11) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_16 = memref.subview %subview_9[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:           %subview_17 = memref.subview %subview_10[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
+# CHECK-NEXT:           %subview_18 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:           %4 = scf.for %arg9 = %c0_19 to %c16 step %c1_20 iter_args(%arg10 = %subview_18) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_22 = memref.subview %subview_16[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_23 = memref.subview %subview_17[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %subview_24 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:             %c3 = arith.constant 3 : index
+# CHECK-NEXT:             %c1_26 = arith.constant 1 : index
+# CHECK-NEXT:             %5 = scf.for %arg11 = %c0_25 to %c3 step %c1_26 iter_args(%arg12 = %subview_24) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_28 = memref.subview %subview_22[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_29 = memref.subview %subview_23[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               %c0_31 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_32 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_33 = arith.constant 1 : index
+# CHECK-NEXT:               %6 = scf.for %arg13 = %c0_31 to %c3_32 step %c1_33 iter_args(%arg14 = %subview_30) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_35 = memref.subview %subview_28[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_36 = memref.subview %subview_29[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_37 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %c0_38 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_39 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_40 = arith.constant 1 : index
+# CHECK-NEXT:                 %7 = scf.for %arg15 = %c0_38 to %c3_39 step %c1_40 iter_args(%arg16 = %subview_37) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                   %subview_42 = memref.subview %subview_35[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_43 = memref.subview %subview_36[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_44 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_42, %subview_43 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_46: f32, %out: f32):
+# CHECK-NEXT:                     %8 = arith.mulf %in, %in_46 : f32
+# CHECK-NEXT:                     %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                     linalg.yield %9 : f32
 # CHECK-NEXT:                   }
+# CHECK-NEXT:                   %subview_45 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                   memref.copy %subview_44, %subview_45 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                   scf.yield %arg16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %subview_41 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %7, %subview_41 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %subview_34 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %6, %subview_34 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %subview_27 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %5, %subview_27 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_21 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %4, %subview_21 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_15 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_15 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_8 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
@@ -230,4 +470,3 @@
 # CHECK-NEXT:   - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32]
 # CHECK-NEXT:  
 # CHECK-NEXT: CODE: 0
-
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
new file mode 100644
index 000000000..e9e9a91c1
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
@@ -0,0 +1,723 @@
+# RUN: python %s 2>&1 | filecheck %s
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+from xtc.artifacts import get_operation
+from xtc.artifacts import get_operation
+
+op = get_operation("conv2d", "ResNet18_01")
+N, H, W, F, R, S, C = [op["dims"][k] for k in ["n", "h", "w", "f", "r", "s", "c"]]
+SH, SW = [op["params"][k] for k in ["SH", "SW"]]
+dtype = "float32"
+
+a = O.tensor((N, H + R - 1, W + S - 1, C), dtype)
+b = O.tensor((R, S, C, F), dtype)
+
+with O.graph(name="conv2d_nhwc_r181") as gb:
+    O.conv2d(a, b, stride=(SH, SW), name="O")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+
+sch = impl.get_scheduler()
+sch.tile("w", {"w1": 4})
+sch.tile("f", {"f1": 16})
+sch.interchange(["b", "h", "w", "f", "r", "s", "c", "w1", "f1"])
+sch.vectorize(["f1"])
+sch.unroll({"w1": 4, "c": 3})
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="conv2d_nhwc_r181_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%1 : tensor<1x112x112x64xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %3 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %4 = arith.addf %out, %3 : f32
+# CHECK-NEXT:       linalg.yield %4 : f32
+# CHECK-NEXT:     } -> tensor<1x112x112x64xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./f" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./c" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_21 "./w1" : !transform.any_op
+# CHECK-NEXT:     transform.include @_vecto failures(suppress) (%tiled_linalg_op_20) : (!transform.any_op) -> ()
+# CHECK-NEXT:     transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op
+# CHECK-NEXT:     transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op
+# CHECK-NEXT:     %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.reduction_to_contract
+# CHECK-NEXT:       transform.apply_patterns.vector.transfer_permutation_patterns
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_outerproduct
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_contraction
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c6 = arith.constant 6 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32>
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32>
+# CHECK-NEXT:       %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32>
+# CHECK-NEXT:         %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32>
+# CHECK-NEXT:           %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32>
+# CHECK-NEXT:             %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:               %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32>
+# CHECK-NEXT:               %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:                 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32>
+# CHECK-NEXT:                 %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32>
+# CHECK-NEXT:                 %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               scf.yield %9 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c6 = arith.constant 6 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32>
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32>
+# CHECK-NEXT:       %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32>
+# CHECK-NEXT:         %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32>
+# CHECK-NEXT:           %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32>
+# CHECK-NEXT:             %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:               %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32>
+# CHECK-NEXT:               %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:                 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32>
+# CHECK-NEXT:                 %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32>
+# CHECK-NEXT:                 %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               scf.yield %9 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c6 = arith.constant 6 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_3 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           %4 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %subview_3) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_5 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>)
+# CHECK-NEXT:             %subview_6 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %4, %subview_4 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %3 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %subview_2 = memref.subview %subview[0, %3, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %subview_3) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %5 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %subview_5 = memref.subview %subview_2[0, 0, %5, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           %6 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %subview_6) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_8 = memref.subview %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:             %subview_9 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %subview_9) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_11 = memref.subview %subview_5[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_12 = memref.subview %subview_8[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %8 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_14 = memref.subview %subview_12[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_15 = memref.subview %subview_13[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_16 = memref.subview %subview_14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_17 = memref.subview %subview_15[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_18 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_17, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_18 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_19 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_18, %subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_20 = memref.subview %subview_15[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_21 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_20, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_21 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_22 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_21, %subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_23 = memref.subview %subview_15[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_24 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_23, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_24 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_25 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_24, %subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_26 = memref.subview %subview_15[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_27 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_26, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_27 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_28 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_27, %subview_28 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_29 = memref.subview %subview_13[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_30 = memref.subview %subview_14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_31 = memref.subview %subview_29[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_32 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_31, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_32 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_33 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_32, %subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_34 = memref.subview %subview_29[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_35 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_34, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_35 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_36 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_35, %subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_37 = memref.subview %subview_29[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_38 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_39 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_38, %subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_40 = memref.subview %subview_29[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_41 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_41 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_42 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_41, %subview_42 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_43 = memref.subview %subview_13[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_44 = memref.subview %subview_14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_45 = memref.subview %subview_43[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_46 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_45, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_46 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_47 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_46, %subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_48 = memref.subview %subview_43[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_49 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_48, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_49 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_50 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_49, %subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_51 = memref.subview %subview_43[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_52 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_51, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_52 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_53 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_52, %subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_54 = memref.subview %subview_43[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_55 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_54, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_55 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
+# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
+# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
+# CHECK-NEXT:                   linalg.yield %10 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 %subview_56 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_55, %subview_56 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               scf.yield %8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %subview_10 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %7, %subview_10 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %6, %subview_7 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: conv2d_nhwc_r181
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x230x230x3xfloat32
+# CHECK-NEXT:   - %1 : 7x7x3x64xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 1x112x112x64xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: conv2d(%0, %1, stride=(2, 2)) {name = 'O'} : [1x230x230x3xfloat32, 7x7x3x64xfloat32] -> [1x112x112x64xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
index 5a7ed668e..b240b6bbd 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
@@ -30,8 +30,9 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
-# CHECK-NEXT: module {
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
@@ -40,28 +41,6 @@
 # CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>)
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%arg2 : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%arg2 : memref<4x32xf32>)
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -85,47 +64,178 @@
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_6 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_12 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_6 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_12 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:       %subview = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %c0_3 = arith.constant 0 : index
 # CHECK-NEXT:       %c32 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_3 to %c32 step %c1_4 {
-# CHECK-NEXT:         %subview_5 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_5 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4_1 = arith.constant 4 : index
 # CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 {
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %0) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
 # CHECK-NEXT:       %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %c0_5 = arith.constant 0 : index
 # CHECK-NEXT:       %c32 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_5 to %c32 step %c1_6 {
-# CHECK-NEXT:         %subview_7 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_8 = memref.subview %subview_3[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_9 = memref.subview %subview_4[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %subview_4) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_8 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_9 = memref.subview %subview_3[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:         %c512 = arith.constant 512 : index
-# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_10 to %c512 step %c1_11 {
-# CHECK-NEXT:           %subview_12 = memref.subview %subview_7[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_13 = memref.subview %subview_8[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_14 = memref.subview %subview_9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_11 to %c512 step %c1_12 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_14 = memref.subview %subview_8[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_16 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_14, %subview_15 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_16 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_16, %subview_17 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_13 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_7 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
@@ -141,4 +251,3 @@
 # CHECK-NEXT:   - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
 # CHECK-NEXT:  
 # CHECK-NEXT: CODE: 0
-
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
index 037bc5f53..04095b436 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
@@ -18,6 +18,11 @@
 impl = Backend(graph, use_tensor_dialect=True)
 
 sch = impl.get_scheduler(default_node="matmul")
+sch.tile("i", {"i1": 2})
+sch.tile("j", {"j1": 16})
+sch.interchange(["k", "i", "j", "i1", "j1"])
+sch.vectorize(["j1"])
+sch.unroll({"i1": 2})
 sched = sch.schedule()
 
 comp = impl.get_compiler(
@@ -31,17 +36,18 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+
+# CHECK: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
 # CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
-# CHECK-NEXT: module {
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
 # CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
 # CHECK-NEXT:     %3 = tensor.empty() : tensor<4x32xf32>
-# CHECK-NEXT:     %collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
+# CHECK-NEXT:     %collapsed = tensor.collapse_shape %2 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
 # CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs =  {__xtc_id_relu_} {
@@ -53,52 +59,6 @@
 # CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.maximumf %in, %in_2 : f32
-# CHECK-NEXT:       linalg.yield %0 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
-# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapse_shape, %cst_1 : memref<128xf32>, f32) outs(%alloca_0 : memref<128xf32>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_2: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.maximumf %in, %in_2 : f32
-# CHECK-NEXT:       linalg.yield %0 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_0 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
-# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -110,15 +70,28 @@
 # CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
 # CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
-# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_9 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./k" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./i1" : !transform.any_op
+# CHECK-NEXT:     transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
+# CHECK-NEXT:     transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:     %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.reduction_to_contract
+# CHECK-NEXT:       transform.apply_patterns.vector.transfer_permutation_patterns
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_outerproduct
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_contraction
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./i" : !transform.any_op
 # CHECK-NEXT:     transform.yield 
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
@@ -127,63 +100,280 @@
 # CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
 # CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_4 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
+# CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
+# CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
+# CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
+# CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<2x32xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c128 = arith.constant 128 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_4: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_4 : f32
+# CHECK-NEXT:         linalg.yield %7 : f32
+# CHECK-NEXT:       } -> tensor<1xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<128xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:       %subview = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_7 = arith.constant 0 : index
-# CHECK-NEXT:       %c32 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_8 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_7 to %c32 step %c1_8 {
-# CHECK-NEXT:         %subview_9 = memref.subview %subview[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_4 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
-# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
+# CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
+# CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
+# CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
+# CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<2x32xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c128 = arith.constant 128 : index
 # CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 {
-# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:       %subview_7 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_8 = memref.subview %alloca[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:       %c32 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c32 step %c1_10 {
-# CHECK-NEXT:         %subview_11 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_12 = memref.subview %subview_7[0, %arg4] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_13 = memref.subview %subview_8[0, %arg4] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
-# CHECK-NEXT:         %c512 = arith.constant 512 : index
-# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_14 to %c512 step %c1_15 {
-# CHECK-NEXT:           %subview_16 = memref.subview %subview_11[0, %arg5] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_17 = memref.subview %subview_12[%arg5, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_18 = memref.subview %subview_13[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_} ins(%subview_16, %subview_17 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_18 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_4: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_4 : f32
+# CHECK-NEXT:         linalg.yield %7 : f32
+# CHECK-NEXT:       } -> tensor<1xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<128xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_5, %subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_4 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %alloca [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
-# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_4 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) {
+# CHECK-NEXT:         %subview_5 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_6) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_8 = memref.subview %subview_4[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_9 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_10 = memref.subview %subview_5[%c0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_11 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %6 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %7 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %subview_11[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %11 = vector.broadcast %10 : f32 to vector<16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %13 = vector.fma %11, %9, %12 : vector<16xf32>
+# CHECK-NEXT:           %14 = vector.insert %13, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %14, %subview_11[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_12 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_11, %subview_12 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_13 = memref.subview %subview_5[%c1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_14 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %15 = vector.transfer_read %subview_13[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %16 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_read %subview_14[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %18 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %19 = vector.extract %15[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %20 = vector.broadcast %19 : f32 to vector<16xf32>
+# CHECK-NEXT:           %21 = vector.extract %17[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %22 = vector.fma %20, %18, %21 : vector<16xf32>
+# CHECK-NEXT:           %23 = vector.insert %22, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %23, %subview_14[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_14, %subview_15 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_16 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_9, %subview_16 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_7 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %2 [[0, 1]] : memref<4x32xf32> into memref<128xf32>
+# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
 # CHECK-NEXT:     %c128 = arith.constant 128 : index
-# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_5 to %c128 step %c1_6 {
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c128 step %c1_3 iter_args(%arg4 = %alloca_1) -> (memref<128xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       %subview_7 = memref.subview %alloca_3[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_4 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_7 : memref<1xf32, strided<[1], offset: ?>>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:       ^bb0(%in: f32, %in_8: f32, %out: f32):
-# CHECK-NEXT:         %0 = arith.maximumf %in, %in_8 : f32
-# CHECK-NEXT:         linalg.yield %0 : f32
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
+# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_0 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_4 : memref<1xf32, strided<[1], offset: ?>>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_6: f32, %out: f32):
+# CHECK-NEXT:         %4 = arith.maximumf %in, %in_6 : f32
+# CHECK-NEXT:         linalg.yield %4 : f32
 # CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
+# CHECK-NEXT:       memref.copy %subview_4, %subview_5 : memref<1xf32, strided<[1], offset: ?>> to memref<1xf32, strided<[1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<128xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %expand_shape = memref.expand_shape %alloca_3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
+# CHECK-NEXT:     %expand_shape = memref.expand_shape %3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
 # CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
index 68c2c7617..255c463df 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
@@ -32,11 +32,12 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
+
+# CHECK: // -----// IR Dump Before transform //----- //
 # CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
 # CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
 # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module {
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
@@ -45,7 +46,7 @@
 # CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
-# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%0, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%inserted_slice, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
 # CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
 # CHECK-NEXT:       %5 = arith.mulf %in, %in_1 : f32
 # CHECK-NEXT:       %6 = arith.addf %out, %5 : f32
@@ -54,54 +55,6 @@
 # CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>)
-# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.mulf %in, %in_1 : f32
-# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
-# CHECK-NEXT:       linalg.yield %1 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%alloc : memref<1x12x12x3xf32>)
-# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%arg2 : memref<1x4x4x16xf32>)
-# CHECK-NEXT:     linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%alloc, %arg1 : memref<1x12x12x3xf32>, memref<5x5x3x16xf32>) outs(%arg2 : memref<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
-# CHECK-NEXT:       %0 = arith.mulf %in, %in_1 : f32
-# CHECK-NEXT:       %1 = arith.addf %out, %0 : f32
-# CHECK-NEXT:       linalg.yield %1 : f32
-# CHECK-NEXT:     }
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -116,7 +69,7 @@
 # CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_5 "./c" : !transform.any_op
-# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -124,31 +77,22 @@
 # CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_13 "./c" : !transform.any_op
-# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_15 "./b" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_17 "./h" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_19 "./w" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_21 "./f" : !transform.any_op
-# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_23 "./b" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_25 "./h" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_27 "./w" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_28, %loops_29 = transform.structured.tile_using_for %tiled_linalg_op_26 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_29 "./f" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_30, %loops_31 = transform.structured.tile_using_for %tiled_linalg_op_28 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_31 "./r" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_32, %loops_33 = transform.structured.tile_using_for %tiled_linalg_op_30 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_33 "./s" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_34, %loops_35 = transform.structured.tile_using_for %tiled_linalg_op_32 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_35 "./c" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_23 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_25 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_27 "./c" : !transform.any_op
 # CHECK-NEXT:     transform.yield 
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
@@ -159,127 +103,494 @@
 # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
 # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c12 = arith.constant 12 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:           %c3 = arith.constant 3 : index
+# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:           } {"./c"}
+# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %6 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %8 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_24 = arith.constant 1 : index
+# CHECK-NEXT:           %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:             %c5 = arith.constant 5 : index
+# CHECK-NEXT:             %c1_30 = arith.constant 1 : index
+# CHECK-NEXT:             %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_36 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_37 = arith.constant 1 : index
+# CHECK-NEXT:               %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_42 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_43 = arith.constant 1 : index
+# CHECK-NEXT:                 %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_49: f32, %out: f32):
+# CHECK-NEXT:                     %14 = arith.mulf %in, %in_49 : f32
+# CHECK-NEXT:                     %15 = arith.addf %out, %14 : f32
+# CHECK-NEXT:                     linalg.yield %15 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c12 = arith.constant 12 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:           %c3 = arith.constant 3 : index
+# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:           } {"./c"}
+# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %6 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %8 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_24 = arith.constant 1 : index
+# CHECK-NEXT:           %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:             %c5 = arith.constant 5 : index
+# CHECK-NEXT:             %c1_30 = arith.constant 1 : index
+# CHECK-NEXT:             %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_36 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_37 = arith.constant 1 : index
+# CHECK-NEXT:               %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_42 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_43 = arith.constant 1 : index
+# CHECK-NEXT:                 %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_49: f32, %out: f32):
+# CHECK-NEXT:                     %14 = arith.mulf %in, %in_49 : f32
+# CHECK-NEXT:                     %15 = arith.addf %out, %14 : f32
+# CHECK-NEXT:                     linalg.yield %15 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0 to %c1 step %c1_0 {
-# CHECK-NEXT:       %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) {
+# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       %c0_9 = arith.constant 0 : index
 # CHECK-NEXT:       %c12 = arith.constant 12 : index
 # CHECK-NEXT:       %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c12 step %c1_10 {
-# CHECK-NEXT:         %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_12 to %c12_13 step %c1_14 {
-# CHECK-NEXT:           %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_9 to %c12 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:         %c12_14 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_13 to %c12_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
 # CHECK-NEXT:           %c3 = arith.constant 3 : index
-# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:           scf.for %arg6 = %c0_16 to %c3 step %c1_17 {
-# CHECK-NEXT:             %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c3 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:             %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:           } {"./c"}
+# CHECK-NEXT:           %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %5, %subview_20 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_16 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_11 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x12x12x3xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %subview = memref.subview %alloc[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
+# CHECK-NEXT:     %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_2 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 {
-# CHECK-NEXT:       %subview_8 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_9 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_9 to %c4 step %c1_10 {
-# CHECK-NEXT:         %subview_11 = memref.subview %subview_8[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_12 to %c4_13 step %c1_14 {
-# CHECK-NEXT:           %subview_15 = memref.subview %subview_11[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_9 to %c4 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_14 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_13 to %c4_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:           scf.for %arg6 = %c0_16 to %c16 step %c1_17 {
-# CHECK-NEXT:             %subview_18 = memref.subview %subview_15[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_18 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
+# CHECK-NEXT:             %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %5, %subview_20 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_16 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_11 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
 # CHECK-NEXT:     %c0_5 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_6 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 {
-# CHECK-NEXT:       %subview_8 = memref.subview %alloc[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %subview_8 = memref.subview %0[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %arg2[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_10 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_12 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_11 to %c4 step %c1_12 {
-# CHECK-NEXT:         %0 = affine.apply #map(%arg4)
-# CHECK-NEXT:         %subview_13 = memref.subview %subview_8[0, %0, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         %subview_14 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:         %subview_15 = memref.subview %subview_10[0, %arg4, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_16 to %c4_17 step %c1_18 {
-# CHECK-NEXT:           %1 = affine.apply #map(%arg5)
-# CHECK-NEXT:           %subview_19 = memref.subview %subview_13[0, 0, %1, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           %subview_20 = memref.subview %subview_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_11 to %c4 step %c1_12 iter_args(%arg6 = %subview_10) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %subview_14 = memref.subview %subview_8[0, %4, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg8 = %subview_16) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %subview_21 = memref.subview %subview_14[0, 0, %6, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           %subview_22 = memref.subview %subview_15[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
+# CHECK-NEXT:           %subview_23 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %c0_24 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
-# CHECK-NEXT:           scf.for %arg6 = %c0_22 to %c16 step %c1_23 {
-# CHECK-NEXT:             %subview_24 = memref.subview %subview_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             %subview_25 = memref.subview %subview_20[0, 0, 0, %arg6] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:             %subview_26 = memref.subview %subview_21[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             %c0_27 = arith.constant 0 : index
+# CHECK-NEXT:           %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_24 to %c16 step %c1_25 iter_args(%arg10 = %subview_23) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_27 = memref.subview %subview_21[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_28 = memref.subview %subview_22[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %subview_29 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             %c0_30 = arith.constant 0 : index
 # CHECK-NEXT:             %c5 = arith.constant 5 : index
-# CHECK-NEXT:             %c1_28 = arith.constant 1 : index
-# CHECK-NEXT:             scf.for %arg7 = %c0_27 to %c5 step %c1_28 {
-# CHECK-NEXT:               %subview_29 = memref.subview %subview_24[0, %arg7, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:               %subview_30 = memref.subview %subview_25[%arg7, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:               %subview_31 = memref.subview %subview_26[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:               %c0_32 = arith.constant 0 : index
-# CHECK-NEXT:               %c5_33 = arith.constant 5 : index
-# CHECK-NEXT:               %c1_34 = arith.constant 1 : index
-# CHECK-NEXT:               scf.for %arg8 = %c0_32 to %c5_33 step %c1_34 {
-# CHECK-NEXT:                 %subview_35 = memref.subview %subview_29[0, 0, %arg8, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_36 = memref.subview %subview_30[0, %arg8, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_37 = memref.subview %subview_31[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %c0_38 = arith.constant 0 : index
+# CHECK-NEXT:             %c1_31 = arith.constant 1 : index
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0_30 to %c5 step %c1_31 iter_args(%arg12 = %subview_29) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_33 = memref.subview %subview_27[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_34 = memref.subview %subview_28[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %subview_35 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               %c0_36 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_37 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_38 = arith.constant 1 : index
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0_36 to %c5_37 step %c1_38 iter_args(%arg14 = %subview_35) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_40 = memref.subview %subview_33[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_41 = memref.subview %subview_34[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_42 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                 %c0_43 = arith.constant 0 : index
 # CHECK-NEXT:                 %c3 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
-# CHECK-NEXT:                 scf.for %arg9 = %c0_38 to %c3 step %c1_39 {
-# CHECK-NEXT:                   %subview_40 = memref.subview %subview_35[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_41 = memref.subview %subview_36[0, 0, %arg9, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_42 = memref.subview %subview_37[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_41 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_42 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_43: f32, %out: f32):
-# CHECK-NEXT:                     %2 = arith.mulf %in, %in_43 : f32
-# CHECK-NEXT:                     %3 = arith.addf %out, %2 : f32
-# CHECK-NEXT:                     linalg.yield %3 : f32
+# CHECK-NEXT:                 %c1_44 = arith.constant 1 : index
+# CHECK-NEXT:                 %10 = scf.for %arg15 = %c0_43 to %c3 step %c1_44 iter_args(%arg16 = %subview_42) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                   %subview_46 = memref.subview %subview_40[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_47 = memref.subview %subview_41[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                   %subview_48 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_47 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_48 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_50: f32, %out: f32):
+# CHECK-NEXT:                     %11 = arith.mulf %in, %in_50 : f32
+# CHECK-NEXT:                     %12 = arith.addf %out, %11 : f32
+# CHECK-NEXT:                     linalg.yield %12 : f32
 # CHECK-NEXT:                   }
+# CHECK-NEXT:                   %subview_49 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                   memref.copy %subview_48, %subview_49 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                   scf.yield %arg16 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %subview_45 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %10, %subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %subview_39 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %9, %subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %subview_32 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %8, %subview_32 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %subview_26 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %7, %subview_26 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_20 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %subview_13 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_13 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     memref.copy %arg2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
+# CHECK-NEXT:     memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
index 34c09d96e..b05c8a8d7 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
@@ -31,8 +31,9 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
-# CHECK-NEXT: module {
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
@@ -45,58 +46,12 @@
 # CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst_2 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32>
-# CHECK-NEXT:     %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%0, %2 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%inserted_slice, %inserted_slice_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32>
 # CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
-# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
 # CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
-# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
-# CHECK-NEXT:     %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>)
-# CHECK-NEXT:     %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%alloca : memref<16x16xf32>)
-# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%alloca_0 : memref<16x16xf32>)
-# CHECK-NEXT:     %subview_2 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%alloca_3 : memref<16x16xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_matmul_padded_} ins(%alloca, %alloca_0 : memref<16x16xf32>, memref<16x16xf32>) outs(%alloca_3 : memref<16x16xf32>)
-# CHECK-NEXT:     %subview_5 = memref.subview %alloca_3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %subview_5, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -107,120 +62,332 @@
 # CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
-# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_7 "./i" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_9 "./j" : !transform.any_op
-# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_11 "./i" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_13 "./j" : !transform.any_op
-# CHECK-NEXT:     %4 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %4 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_15 "./i" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_17 "./j" : !transform.any_op
-# CHECK-NEXT:     %5 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %5 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_19 "./i" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_21 "./j" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_23 "./k" : !transform.any_op
-# CHECK-NEXT:     %6 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %6 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_25 "./i" : !transform.any_op
-# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-# CHECK-NEXT:     transform.annotate %loops_27 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./k" : !transform.any_op
 # CHECK-NEXT:     transform.yield 
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_2 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_5 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_7 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_10 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:     %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_16 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_23 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_24 = arith.constant 1 : index
+# CHECK-NEXT:         %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_29 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_25 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_18 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_2 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_5 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_7 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_10 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:     %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_16 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_23 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_24 = arith.constant 1 : index
+# CHECK-NEXT:         %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_29 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_25 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice_18 : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0 to %c16 step %c1 {
-# CHECK-NEXT:       %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:       %c16_17 = arith.constant 16 : index
 # CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview = memref.subview %alloca[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_2 = arith.constant 0 : index
 # CHECK-NEXT:     %c16_3 = arith.constant 16 : index
 # CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 {
-# CHECK-NEXT:       %subview_15 = memref.subview %alloca_0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:       %c16_17 = arith.constant 16 : index
 # CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_5 = memref.subview %alloca_0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %subview_5 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:     %cst_7 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:     %c16_9 = arith.constant 16 : index
 # CHECK-NEXT:     %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 {
-# CHECK-NEXT:       %subview_15 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %alloca_6) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:       %c16_17 = arith.constant 16 : index
 # CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_16 to %c16_17 step %c1_18 {
-# CHECK-NEXT:         %subview_19 = memref.subview %subview_15[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_19 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:     %c16_12 = arith.constant 16 : index
 # CHECK-NEXT:     %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 {
-# CHECK-NEXT:       %subview_15 = memref.subview %alloca[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %subview_16 = memref.subview %alloca_0[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
-# CHECK-NEXT:       %subview_17 = memref.subview %alloca_6[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 iter_args(%arg4 = %2) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_15 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_16 = memref.subview %1[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
+# CHECK-NEXT:       %subview_17 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       %c0_18 = arith.constant 0 : index
 # CHECK-NEXT:       %c16_19 = arith.constant 16 : index
 # CHECK-NEXT:       %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg4 = %c0_18 to %c16_19 step %c1_20 {
-# CHECK-NEXT:         %subview_21 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %subview_22 = memref.subview %subview_16[0, %arg4] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %subview_23 = memref.subview %subview_17[0, %arg4] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:         %c16_25 = arith.constant 16 : index
-# CHECK-NEXT:         %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg5 = %c0_24 to %c16_25 step %c1_26 {
-# CHECK-NEXT:           %subview_27 = memref.subview %subview_21[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           %subview_28 = memref.subview %subview_22[%arg5, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           %subview_29 = memref.subview %subview_23[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_27, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_29 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_18 to %c16_19 step %c1_20 iter_args(%arg6 = %subview_17) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_22 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_23 = memref.subview %subview_16[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_24 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_26 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_27 = arith.constant 1 : index
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0_25 to %c16_26 step %c1_27 iter_args(%arg8 = %subview_24) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_29 = memref.subview %subview_22[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_30 = memref.subview %subview_23[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_31 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_31 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_32 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_31, %subview_32 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_28 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_21 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_21 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_14 = memref.subview %alloca_6[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %subview_14 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
index c748923b8..7a9eb3442 100644
--- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -32,8 +32,9 @@
 executor = module.get_executor(validate=True)
 res = executor.execute()
 print(f"CODE: {res}")
-# CHECK: // -----// IR Dump Before Tensor Lowering //----- //
-# CHECK-NEXT: module {
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
@@ -42,40 +43,10 @@
 # CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32>
-# CHECK-NEXT:     %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %0 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32>
+# CHECK-NEXT:     %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %2 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32>
 # CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: module {
-# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
-# CHECK-NEXT: }
-# CHECK-NEXT:  
-# CHECK-NEXT: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : memref<4x512xf32>, memref<512x32xf32>) outs(%alloca : memref<4x32xf32>)
-# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     linalg.matmul {__xtc_id_E_} ins(%arg2, %alloca : memref<32x4xf32>, memref<4x32xf32>) outs(%arg3 : memref<32x32xf32>)
-# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
-# CHECK-NEXT:     return
-# CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
 # CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
 # CHECK-NEXT:     transform.yield 
@@ -111,87 +82,333 @@
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
-# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg4 = %c0 to %c4 step %c1 {
-# CHECK-NEXT:       %subview = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %c0_9 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_10 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 {
-# CHECK-NEXT:         %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %c0_0 = arith.constant 0 : index
 # CHECK-NEXT:     %c4_1 = arith.constant 4 : index
 # CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 {
-# CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %alloca[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_12 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 {
-# CHECK-NEXT:         %subview_14 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
 # CHECK-NEXT:         %c0_17 = arith.constant 0 : index
 # CHECK-NEXT:         %c512 = arith.constant 512 : index
 # CHECK-NEXT:         %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg6 = %c0_17 to %c512 step %c1_18 {
-# CHECK-NEXT:           %subview_19 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_20 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_21 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_19, %subview_20 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_23 : tensor<1x1xf32>
 # CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
 # CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_4 = arith.constant 0 : index
 # CHECK-NEXT:     %c32 = arith.constant 32 : index
 # CHECK-NEXT:     %c1_5 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg4 = %c0_4 to %c32 step %c1_5 {
-# CHECK-NEXT:       %subview = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:     %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %c0_9 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_10 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg5 = %c0_9 to %c32_10 step %c1_11 {
-# CHECK-NEXT:         %subview_12 = memref.subview %subview[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_12 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %c0_6 = arith.constant 0 : index
 # CHECK-NEXT:     %c32_7 = arith.constant 32 : index
 # CHECK-NEXT:     %c1_8 = arith.constant 1 : index
-# CHECK-NEXT:     scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 {
-# CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %alloca[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %arg3[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:     %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %c0_11 = arith.constant 0 : index
 # CHECK-NEXT:       %c32_12 = arith.constant 32 : index
 # CHECK-NEXT:       %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:       scf.for %arg5 = %c0_11 to %c32_12 step %c1_13 {
-# CHECK-NEXT:         %subview_14 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, %arg5] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_16 = memref.subview %subview_10[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
 # CHECK-NEXT:         %c0_17 = arith.constant 0 : index
 # CHECK-NEXT:         %c4_18 = arith.constant 4 : index
 # CHECK-NEXT:         %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:         scf.for %arg6 = %c0_17 to %c4_18 step %c1_19 {
-# CHECK-NEXT:           %subview_20 = memref.subview %subview_14[0, %arg6] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[%arg6, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_22 = memref.subview %subview_16[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_20, %subview_21 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_22 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_20 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_23 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
+# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_20 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %0) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_15 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_16 = memref.subview %subview_9[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0_18 to %c512 step %c1_19 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_22 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_23 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_21, %subview_22 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_23 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_23, %subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_20 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_20 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %3 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %2) -> (memref<32x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:       %subview_9 = memref.subview %1[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
+# CHECK-NEXT:       %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_15 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:         %subview_16 = memref.subview %subview_9[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_19 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_20 = arith.constant 1 : index
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0_18 to %c4_19 step %c1_20 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_22 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:           %subview_23 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_22, %subview_23 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_25 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_24, %subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_21 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     memref.copy %arg3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
+# CHECK-NEXT:     memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }

From e9b27d0c0e198ae7fad05bf492a1579b51fe7692 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liam.semeria@inria.fr>
Date: Fri, 13 Feb 2026 15:04:07 +0100
Subject: [PATCH 11/14] tensor-dialect: multi-output graphs and nodes support

---
 src/xtc/backends/mlir/MlirGraphBackend.py | 30 +++++++++++------------
 src/xtc/backends/mlir/MlirOps.py          | 10 ++++----
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index 0c05ebd16..ab973fb68 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -94,13 +94,12 @@ def _xdsl_generate_node(
             variables[name] = alloca.results[0]
         args = [variables[name] for name in names]
         _, attrs = operation.generate(block=block, args=args)
-        last_node = attrs["nodes_map"].get("return_node_id")
         # the tensor dialect needs the result of the op, not the alloca
         if self.xdsl_type == TensorType:
-            # for name in node.outputs:
-            assert len(node.outputs) == 1
-            variables[node.outputs[0]] = last_node.results[0]
-        return attrs, last_node
+            assert len(node.outputs) == len(attrs["output_nodes"])
+            for name, output in zip(node.outputs, attrs["output_nodes"]):
+                variables[name] = output.results[0]
+        return attrs
 
     def _init_from_graph(
         self,
@@ -129,22 +128,21 @@ def _init_from_graph(
             for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args)
         }
         block_attrs = []
-        last_node = None
 
         for node in graph.nodes.values():
-            node_attrs, last_node = self._xdsl_generate_node(
-                node, inlined_block, variables
-            )
+            node_attrs = self._xdsl_generate_node(node, inlined_block, variables)
             block_attrs.append(node_attrs)
         with ImplicitBuilder(inlined_block):
             if self.xdsl_type == TensorType:
-                assert last_node
-                # write the final tensor value to the output buffer
-                dest = bufferization.MaterializeInDestinationOp(
-                    operands=((last_node.results[0],), (inlined_block.args[-1],)),
-                    result_types=((),),
-                    attributes={"writable": UnitAttr(), "restrict": UnitAttr()},
-                )
+                # write the final tensor values to the output buffers
+                for name, out_arg in zip(
+                    graph.outputs, inlined_block.args[-len(graph.outputs) :]
+                ):
+                    bufferization.MaterializeInDestinationOp(
+                        operands=((variables[name],), (out_arg,)),
+                        result_types=((),),
+                        attributes={"writable": UnitAttr(), "restrict": UnitAttr()},
+                    )
             func.ReturnOp()
         region = Region([inlined_block])  # type: ignore # issue with mypy
         payload = xdslFuncOp.from_region(
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index df8a4aba0..c7b0eb411 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -192,12 +192,12 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 reduce_node_id: reduce,
-                "return_node_id": reduce,
             },
             "dims_sizes": [
                 {"i": Ki, "j": Kj},
                 self.dims_sizes(),
             ],
+            "output_nodes": [reduce],
         }
         return block, attrs
 
@@ -341,12 +341,12 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 reduce_node_id: reduce,
-                "return_node_id": reduce,
             },
             "dims_sizes": [
                 {"b": Kb, "h": Kh, "w": Kw, "f": Kf},
                 self.dims_sizes(),
             ],
+            "output_nodes": [reduce],
         }
         return block, attrs
 
@@ -497,11 +497,11 @@ def generate_op(
         attrs = {
             "nodes_map": {
                 relu_node_id: relu,
-                "return_node_id": relu_result,
             },
             "dims_sizes": [
                 self.dims_sizes(),
             ],
+            "output_nodes": [relu_result],
         }
         return block, attrs
 
@@ -613,12 +613,12 @@ def generate_op(
             "nodes_map": {
                 fill_node_id: fill,
                 copy_node_id: None if using_tensors else copy,
-                "return_node_id": copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
                 *([] if using_tensors else [self.dims_sizes()]),
             ],
+            "output_nodes": [copy],
         }
         return block, attrs
 
@@ -728,9 +728,9 @@ def generate_op(
         attrs = {
             "nodes_map": {
                 copy_node_id: None if using_tensors else copy,
-                "return_node_id": copy,
             },
             "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])],
+            "output_nodes": [copy],
         }
         return block, attrs
 

From b06b0f2abbf92145f4f6a13984291d1e84523402 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liamsemeria@gmail.com>
Date: Tue, 24 Feb 2026 11:32:14 +0100
Subject: [PATCH 12/14] tensor-dialect: changed to non-collapse relu, added
 fusion cleanup passes

---
 src/xtc/backends/mlir/MlirCompilerPasses.py   |   2 +
 src/xtc/backends/mlir/MlirOps.py              |  81 ++--
 .../test_conv2d_mini_mlir_tensor.py           | 180 +++------
 .../test_conv2d_r181_mlir_tensor.py           | 369 +++++++++---------
 .../tensor_dialect/test_matmul_mlir_tensor.py |  65 ++-
 .../test_matmul_relu_mlir_tensor.py           | 294 +++++++-------
 .../test_pad_conv2d_mlir_tensor.py            | 244 +++++-------
 .../test_pad_matmul_unpad_mlir_tensor.py      | 133 +++----
 .../test_two_matmuls_mlir_tensor.py           | 130 +++---
 9 files changed, 622 insertions(+), 876 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index a52e6aa21..804adce6b 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -563,6 +563,8 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram):
         bufferize_options.append("buffer-alignment=256")
     apply_passes.run(
         [
+            "canonicalize",
+            "cse",
             "eliminate-empty-tensors",  # causes ops to write directly to out buffer
             f"one-shot-bufferize{{{' '.join(bufferize_options)}}}",
             "func.func(buffer-hoisting)",
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index c7b0eb411..836f7d2e7 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -421,77 +421,58 @@ def generate_op(
                 ]
             )
             if self.op_type == TensorType:
-                inp = tensor.CollapseShapeOp(  # type: ignore
-                    operands=[args[0]],
-                    properties=dict(reassociation=inp_reassociation),
-                    result_types=[self.op_type(elt_type, (inp_size,))],
-                )
-                # create empty tensor for collapsed output shape
-                out_empty = tensor.EmptyOp([], TensorType(elt_type, [out_size]))
-                out_operand = out_empty.tensor
+                out_operand = args[1]
+                inp_operand = args[0]
+                rank = len(out_shape)
+                iterator_types = [StringAttr("parallel")] * rank
+                indexing_maps = [
+                    AffineMapAttr(AffineMap.identity(rank)),  # input
+                    AffineMapAttr(
+                        AffineMap.identity(rank).drop_results(out_shape)
+                    ),  # scalar
+                    AffineMapAttr(AffineMap.identity(rank)),  # output
+                ]
             else:
                 inp = memref.CollapseShapeOp(  # type: ignore
                     operands=[args[0]],
                     properties=dict(reassociation=inp_reassociation),
                     result_types=[self.op_type(elt_type, (inp_size,))],
                 )
+                inp_operand = inp.results[0]  # type: ignore
                 out = memref.CollapseShapeOp(
                     operands=[args[1]],
                     properties=dict(reassociation=out_reassociation),
                     result_types=[self.op_type(elt_type, (out_size,))],
                 )
-                out_operand = out.results[0]
-
-            result = (
-                (TensorType(elt_type, [out_size]),)
-                if self.op_type == TensorType
-                else ()
-            )
+                out_operand = out.results[0]  # type: ignore
+                iterator_types = [
+                    StringAttr({"P": "parallel", "R": "reduction"}[k])
+                    for k in self.KINDS
+                ]
+                # ignore typing due to xdsl hints limitation
+                indexing_maps = [
+                    AffineMapAttr(AffineMap.from_callable(lambda i: (i,))),  # type: ignore
+                    AffineMapAttr(AffineMap.from_callable(lambda _: ())),  # type: ignore
+                    AffineMapAttr(AffineMap.from_callable(lambda i: (i,))),  # type: ignore
+                ]
+                iterator_types = [
+                    StringAttr({"P": "parallel", "R": "reduction"}[k])
+                    for k in self.KINDS
+                ]
+            result = (args[1].type,) if self.op_type == TensorType else ()
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
-            iterator_types = [
-                StringAttr({"P": "parallel", "R": "reduction"}[k]) for k in self.KINDS
-            ]
             block_in = Block(arg_types=[f32, f32, f32])
             with ImplicitBuilder(block_in):
                 max = arith.MaximumfOp(block_in.args[0], block_in.args[1])
                 linalg.YieldOp(max)
             relu = linalg.GenericOp(
-                inputs=(inp.results[0], cst0.results[0]),
+                inputs=(inp_operand, cst0.results[0]),
                 outputs=(out_operand,),
                 body=Region([block_in]),  # type: ignore # mypy issue with dataclass
-                # ignore typing due to xdsl hints limitation
-                indexing_maps=[
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda i:  # type: ignore
-                            (i,)
-                        )
-                    ),
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda _:  # type: ignore
-                            ()
-                        )
-                    ),
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda i:  # type: ignore
-                            (i,)
-                        )
-                    ),
-                ],
+                indexing_maps=indexing_maps,
                 iterator_types=iterator_types,
                 result_types=result,
             )
-            relu_result = None
-            if self.op_type == TensorType:
-                relu_result = tensor.ExpandShapeOp(
-                    relu.results[0],
-                    reassociation=out_reassociation,
-                    result_type=TensorType(elt_type, out_shape),
-                    static_output_shape=out_shape,
-                    dynamic_output_shape=[],
-                )
         relu_node_id = f"{self.name}"
         relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr()
         attrs = {
@@ -501,7 +482,7 @@ def generate_op(
             "dims_sizes": [
                 self.dims_sizes(),
             ],
-            "output_nodes": [relu_result],
+            "output_nodes": [relu],
         }
         return block, attrs
 
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
index bd8db60bd..a6791aaad 100644
--- a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
@@ -333,127 +333,73 @@
 # CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c8 = arith.constant 8 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) {
-# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:       %c8 = arith.constant 8 : index
-# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %subview) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:         %c8_9 = arith.constant 8 : index
-# CHECK-NEXT:         %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_8 to %c8_9 step %c1_10 iter_args(%arg8 = %subview_7) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_12 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:           %4 = scf.for %arg9 = %c0_13 to %c16 step %c1_14 iter_args(%arg10 = %subview_12) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_16 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
-# CHECK-NEXT:             %subview_17 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %subview_16, %subview_17 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_15 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %4, %subview_15 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %3, %subview_11 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_3 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_3 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_3, %subview_4 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_2 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
-# CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) {
-# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
-# CHECK-NEXT:       %c8 = arith.constant 8 : index
-# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %subview_5) -> (memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_9 = memref.subview %subview[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:         %subview_10 = memref.subview %subview_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c8_13 = arith.constant 8 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_12 to %c8_13 step %c1_14 iter_args(%arg8 = %subview_11) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_16 = memref.subview %subview_9[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:           %subview_17 = memref.subview %subview_10[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>>
-# CHECK-NEXT:           %subview_18 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
-# CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:           %4 = scf.for %arg9 = %c0_19 to %c16 step %c1_20 iter_args(%arg10 = %subview_18) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_22 = memref.subview %subview_16[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:             %subview_23 = memref.subview %subview_17[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32, strided<[144, 48, 16, 1]>> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:             %subview_24 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:             %c3 = arith.constant 3 : index
-# CHECK-NEXT:             %c1_26 = arith.constant 1 : index
-# CHECK-NEXT:             %5 = scf.for %arg11 = %c0_25 to %c3 step %c1_26 iter_args(%arg12 = %subview_24) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:               %subview_28 = memref.subview %subview_22[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:               %subview_29 = memref.subview %subview_23[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:               %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:               %c0_31 = arith.constant 0 : index
-# CHECK-NEXT:               %c3_32 = arith.constant 3 : index
-# CHECK-NEXT:               %c1_33 = arith.constant 1 : index
-# CHECK-NEXT:               %6 = scf.for %arg13 = %c0_31 to %c3_32 step %c1_33 iter_args(%arg14 = %subview_30) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:                 %subview_35 = memref.subview %subview_28[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_36 = memref.subview %subview_29[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_37 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %c0_38 = arith.constant 0 : index
-# CHECK-NEXT:                 %c3_39 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_40 = arith.constant 1 : index
-# CHECK-NEXT:                 %7 = scf.for %arg15 = %c0_38 to %c3_39 step %c1_40 iter_args(%arg16 = %subview_37) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
-# CHECK-NEXT:                   %subview_42 = memref.subview %subview_35[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_43 = memref.subview %subview_36[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_44 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_42, %subview_43 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_46: f32, %out: f32):
-# CHECK-NEXT:                     %8 = arith.mulf %in, %in_46 : f32
-# CHECK-NEXT:                     %9 = arith.addf %out, %8 : f32
-# CHECK-NEXT:                     linalg.yield %9 : f32
-# CHECK-NEXT:                   }
-# CHECK-NEXT:                   %subview_45 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                   memref.copy %subview_44, %subview_45 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                   scf.yield %arg16 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                 } {"./c"}
-# CHECK-NEXT:                 %subview_41 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %7, %subview_41 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %subview[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %arg1[0, 0, 0, %arg7] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %4 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %subview_6) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_8 = memref.subview %subview_2[0, %arg9, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_9 = memref.subview %subview_5[%arg9, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %5 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_10 = memref.subview %subview_8[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_11 = memref.subview %subview_9[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %6 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_12 = memref.subview %subview_10[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_14: f32, %out: f32):
+# CHECK-NEXT:                   %7 = arith.mulf %in, %in_14 : f32
+# CHECK-NEXT:                   %8 = arith.addf %out, %7 : f32
+# CHECK-NEXT:                   linalg.yield %8 : f32
+# CHECK-NEXT:                 }
 # CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:               } {"./s"}
-# CHECK-NEXT:               %subview_34 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:               memref.copy %6, %subview_34 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             } {"./r"}
-# CHECK-NEXT:             %subview_27 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %5, %subview_27 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_21 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %4, %subview_21 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_15 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %3, %subview_15 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_8 : memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x8x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               } {"./c"}
+# CHECK-NEXT:               scf.yield %6 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %5 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %4, %subview_7 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_4 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
-# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     } {"./h"}
 # CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
index e9e9a91c1..a363b4e14 100644
--- a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
@@ -503,9 +503,6 @@
 # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %c6 = arith.constant 6 : index
-# CHECK-NEXT:     %c3 = arith.constant 3 : index
-# CHECK-NEXT:     %c2 = arith.constant 2 : index
 # CHECK-NEXT:     %c7 = arith.constant 7 : index
 # CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
@@ -514,197 +511,185 @@
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) {
-# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_3 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           %4 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %subview_3) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_5 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>)
-# CHECK-NEXT:             %subview_6 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %4, %subview_4 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c64 step %c1 iter_args(%arg8 = %subview_2) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_4, %subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_3 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
-# CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) {
-# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:         %3 = affine.apply #map(%arg5)
-# CHECK-NEXT:         %subview_2 = memref.subview %subview[0, %3, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %subview_3) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:           %5 = affine.apply #map(%arg7)
-# CHECK-NEXT:           %subview_5 = memref.subview %subview_2[0, 0, %5, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           %6 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %subview_6) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_8 = memref.subview %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:             %subview_9 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %subview_9) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:               %subview_11 = memref.subview %subview_5[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:               %subview_12 = memref.subview %subview_8[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:               %8 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
-# CHECK-NEXT:                 %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_14 = memref.subview %subview_12[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_15 = memref.subview %subview_13[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_16 = memref.subview %subview_14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_17 = memref.subview %subview_15[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_18 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_17, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_18 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_19 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_18, %subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_20 = memref.subview %subview_15[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_21 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_20, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_21 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_22 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_21, %subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_23 = memref.subview %subview_15[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_24 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_23, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_24 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_25 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_24, %subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_26 = memref.subview %subview_15[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_27 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_26, %subview_16 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_27 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_28 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_27, %subview_28 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_29 = memref.subview %subview_13[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_30 = memref.subview %subview_14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_31 = memref.subview %subview_29[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_32 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_31, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_32 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_33 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_32, %subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_34 = memref.subview %subview_29[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_35 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_34, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_35 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_36 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_35, %subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_37 = memref.subview %subview_29[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_38 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_37, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_38 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_39 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_38, %subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_40 = memref.subview %subview_29[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_41 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_40, %subview_30 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_41 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_42 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_41, %subview_42 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_43 = memref.subview %subview_13[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_44 = memref.subview %subview_14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_45 = memref.subview %subview_43[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_46 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_45, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_46 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_47 = memref.subview %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_46, %subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_48 = memref.subview %subview_43[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_49 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_48, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_49 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_50 = memref.subview %arg14[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_49, %subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_51 = memref.subview %subview_43[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_52 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_51, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_52 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_53 = memref.subview %arg14[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_52, %subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_54 = memref.subview %subview_43[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_55 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_54, %subview_44 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_55 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
-# CHECK-NEXT:                 ^bb0(%in: f32, %in_57: f32, %out: f32):
-# CHECK-NEXT:                   %9 = arith.mulf %in, %in_57 : f32
-# CHECK-NEXT:                   %10 = arith.addf %out, %9 : f32
-# CHECK-NEXT:                   linalg.yield %10 : f32
-# CHECK-NEXT:                 }
-# CHECK-NEXT:                 %subview_56 = memref.subview %arg14[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %subview_55, %subview_56 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:               } {"./s"}
-# CHECK-NEXT:               scf.yield %8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             } {"./r"}
-# CHECK-NEXT:             %subview_10 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %7, %subview_10 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %6, %subview_7 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x112x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %subview = memref.subview %arg0[0, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %2 = affine.apply #map(%arg3)
+# CHECK-NEXT:       %subview_0 = memref.subview %subview[0, %2, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c4 iter_args(%arg6 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %subview_3 = memref.subview %subview_0[0, 0, %4, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c64 step %c16 iter_args(%arg8 = %subview_4) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_6 = memref.subview %arg1[0, 0, 0, %arg7] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           %6 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %subview_7) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_9 = memref.subview %subview_3[0, %arg9, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_10 = memref.subview %subview_6[%arg9, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:             %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_11 = memref.subview %subview_9[0, 0, %arg11, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_12 = memref.subview %subview_10[0, %arg11, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_13 = memref.subview %subview_11[0, 0, 0, 0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_14 = memref.subview %subview_12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_15 = memref.subview %subview_13[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_16 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_15, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_16 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_17 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_16, %subview_17 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_18 = memref.subview %subview_13[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_19 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_18, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_20 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_19, %subview_20 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_21 = memref.subview %subview_13[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_22 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_21, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_23 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_22, %subview_23 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_24 = memref.subview %subview_13[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_25 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_24, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_26 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_25, %subview_26 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_27 = memref.subview %subview_11[0, 0, 0, 1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_28 = memref.subview %subview_12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_29 = memref.subview %subview_27[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_29, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_30 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_31 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_30, %subview_31 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_32 = memref.subview %subview_27[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_33 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_32, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_34 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_33, %subview_34 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_35 = memref.subview %subview_27[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_36 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_35, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_37 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_36, %subview_37 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_38 = memref.subview %subview_27[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_39 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_38, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_40 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_39, %subview_40 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_41 = memref.subview %subview_11[0, 0, 0, 2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_42 = memref.subview %subview_12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_43 = memref.subview %subview_41[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_44 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_45 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_44, %subview_45 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_46 = memref.subview %subview_41[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_47 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_48 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_47, %subview_48 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_49 = memref.subview %subview_41[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_50 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_49, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_51 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_50, %subview_51 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_52 = memref.subview %subview_41[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_53 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_52, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_54 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_53, %subview_54 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %7 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %6, %subview_8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_5 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
-# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     } {"./h"}
 # CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
index b240b6bbd..dd676fa28 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
@@ -181,58 +181,43 @@
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
-# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
-# CHECK-NEXT:       %c32 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_5 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
-# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
-# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %0) -> (memref<4x32xf32>) {
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:       %subview_3 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:       %c32 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:       %2 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %subview_4) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_8 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_9 = memref.subview %subview_3[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:         %c512 = arith.constant 512 : index
-# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
-# CHECK-NEXT:         %3 = scf.for %arg7 = %c0_11 to %c512 step %c1_12 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_14 = memref.subview %subview_8[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_16 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_14, %subview_15 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_16 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_16, %subview_17 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg1[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c512 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %subview_13 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %3, %subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %2, %subview_7 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32>
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
index 04095b436..b83522853 100644
--- a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
@@ -38,8 +38,8 @@
 print(f"CODE: {res}")
 
 # CHECK: // -----// IR Dump Before transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
@@ -47,16 +47,13 @@
 # CHECK-NEXT:     %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
 # CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
 # CHECK-NEXT:     %3 = tensor.empty() : tensor<4x32xf32>
-# CHECK-NEXT:     %collapsed = tensor.collapse_shape %2 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %5 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%collapsed, %cst_0 : tensor<128xf32>, f32) outs(%4 : tensor<128xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %cst_0 : tensor<4x32xf32>, f32) outs(%3 : tensor<4x32xf32>) attrs =  {__xtc_id_relu_} {
 # CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
-# CHECK-NEXT:       %6 = arith.maximumf %in, %in_1 : f32
-# CHECK-NEXT:       linalg.yield %6 : f32
-# CHECK-NEXT:     } -> tensor<128xf32>
-# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
-# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:       %5 = arith.maximumf %in, %in_1 : f32
+# CHECK-NEXT:       linalg.yield %5 : f32
+# CHECK-NEXT:     } -> tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
@@ -97,8 +94,8 @@
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
@@ -115,82 +112,80 @@
 # CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
-# CHECK-NEXT:         %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_4 : tensor<1x32xf32>
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_5 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
 # CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
 # CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
-# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
-# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
-# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
-# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) {
-# CHECK-NEXT:           %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
-# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
-# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
-# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
 # CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
 # CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
 # CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
 # CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
-# CHECK-NEXT:           %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
-# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
-# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
-# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
 # CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
 # CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
 # CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
 # CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
-# CHECK-NEXT:           %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
-# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<2x32xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_15 : tensor<2x32xf32>
 # CHECK-NEXT:         } {"./j"}
 # CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
 # CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:       } {"./i"}
 # CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./k"}
-# CHECK-NEXT:     %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:     %c128 = arith.constant 128 : index
-# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
-# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
-# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:       ^bb0(%in: f32, %in_4: f32, %out: f32):
-# CHECK-NEXT:         %7 = arith.maximumf %in, %in_4 : f32
+# CHECK-NEXT:     %c4_2 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_5: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_5 : f32
 # CHECK-NEXT:         linalg.yield %7 : f32
-# CHECK-NEXT:       } -> tensor<1xf32>
-# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice : tensor<128xf32>
+# CHECK-NEXT:       } -> tensor<1x32xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
-# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
@@ -207,85 +202,82 @@
 # CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
-# CHECK-NEXT:         %extracted_slice_3 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_3 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_4 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_4 : tensor<1x32xf32>
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_5 : tensor<1x32xf32>
 # CHECK-NEXT:       } {"./j"}
 # CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
 # CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
-# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
 # CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
-# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
-# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
-# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_5) -> (tensor<2x32xf32>) {
-# CHECK-NEXT:           %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
-# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %extracted_slice_4[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_8[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
-# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
-# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
 # CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
 # CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
 # CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
 # CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
-# CHECK-NEXT:           %inserted_slice_10 = tensor.insert_slice %17 into %extracted_slice_7[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
-# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %extracted_slice_4[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_11[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
-# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_6[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
-# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
 # CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
 # CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
 # CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
 # CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
-# CHECK-NEXT:           %inserted_slice_13 = tensor.insert_slice %27 into %inserted_slice_10[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
-# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %inserted_slice_13 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<2x32xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_15 : tensor<2x32xf32>
 # CHECK-NEXT:         } {"./j"}
 # CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
 # CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:       } {"./i"}
 # CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./k"}
-# CHECK-NEXT:     %collapsed = tensor.collapse_shape %3 [[0, 1]] : tensor<4x32xf32> into tensor<128xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<128xf32>
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<4x32xf32>
 # CHECK-NEXT:     %c0_1 = arith.constant 0 : index
-# CHECK-NEXT:     %c128 = arith.constant 128 : index
-# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c128 step %c1_2 iter_args(%arg4 = %4) -> (tensor<128xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %collapsed[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
-# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf32> to tensor<1xf32>
-# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %cst_0 : tensor<1xf32>, f32) outs(%extracted_slice_3 : tensor<1xf32>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:       ^bb0(%in: f32, %in_4: f32, %out: f32):
-# CHECK-NEXT:         %7 = arith.maximumf %in, %in_4 : f32
+# CHECK-NEXT:     %c4_2 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_5: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_5 : f32
 # CHECK-NEXT:         linalg.yield %7 : f32
-# CHECK-NEXT:       } -> tensor<1xf32>
-# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3] [1] [1] : tensor<1xf32> into tensor<128xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice : tensor<128xf32>
+# CHECK-NEXT:       } -> tensor<1x32xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %expanded = tensor.expand_shape %5 [[0, 1]] output_shape [4, 32] : tensor<128xf32> into tensor<4x32xf32>
-# CHECK-NEXT:     bufferization.materialize_in_destination %expanded in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0)>
-# CHECK-NEXT: #map1 = affine_map<(d0) -> ()>
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
 # CHECK-NEXT:     %0 = ub.poison : f32
 # CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c2 = arith.constant 2 : index
@@ -294,87 +286,79 @@
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c4 = arith.constant 4 : index
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
-# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%subview_5 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_5, %subview_6 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_4 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) {
-# CHECK-NEXT:         %subview_5 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_6) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_8 = memref.subview %subview_4[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_9 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_10 = memref.subview %subview_5[%c0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_11 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %6 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
-# CHECK-NEXT:           %7 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
-# CHECK-NEXT:           %8 = vector.transfer_read %subview_11[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:         %subview_1 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_2) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_4 = memref.subview %subview_0[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_1[0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_7 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %6 = vector.transfer_read %subview_6[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %7 = vector.transfer_read %subview_4[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %subview_7[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
 # CHECK-NEXT:           %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32>
 # CHECK-NEXT:           %11 = vector.broadcast %10 : f32 to vector<16xf32>
 # CHECK-NEXT:           %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32>
 # CHECK-NEXT:           %13 = vector.fma %11, %9, %12 : vector<16xf32>
-# CHECK-NEXT:           %14 = vector.insert %13, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           vector.transfer_write %14, %subview_11[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_12 = memref.subview %subview_9[%c0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_11, %subview_12 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_13 = memref.subview %subview_5[%c1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_14 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %15 = vector.transfer_read %subview_13[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
-# CHECK-NEXT:           %16 = vector.transfer_read %subview_8[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
-# CHECK-NEXT:           %17 = vector.transfer_read %subview_14[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
-# CHECK-NEXT:           %18 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32>
-# CHECK-NEXT:           %19 = vector.extract %15[0, 0] : f32 from vector<1x1xf32>
-# CHECK-NEXT:           %20 = vector.broadcast %19 : f32 to vector<16xf32>
-# CHECK-NEXT:           %21 = vector.extract %17[0] : vector<16xf32> from vector<1x16xf32>
-# CHECK-NEXT:           %22 = vector.fma %20, %18, %21 : vector<16xf32>
-# CHECK-NEXT:           %23 = vector.insert %22, %cst [0] : vector<16xf32> into vector<1x16xf32>
-# CHECK-NEXT:           vector.transfer_write %23, %subview_14[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_15 = memref.subview %subview_9[%c1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_14, %subview_15 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_16 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_9, %subview_16 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %14 = vector.broadcast %13 : vector<16xf32> to vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %14, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_8 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_7, %subview_8 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_9 = memref.subview %subview_1[1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_10 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %15 = vector.transfer_read %subview_9[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %16 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.extract %15[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %18 = vector.broadcast %17 : f32 to vector<16xf32>
+# CHECK-NEXT:           %19 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.fma %18, %9, %19 : vector<16xf32>
+# CHECK-NEXT:           %21 = vector.broadcast %20 : vector<16xf32> to vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %21, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_11 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_10, %subview_11 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_12 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_5, %subview_12 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:           scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./j"}
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_7 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_3 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<4x32xf32>
 # CHECK-NEXT:       } {"./i"}
 # CHECK-NEXT:       scf.yield %4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./k"}
-# CHECK-NEXT:     %collapse_shape = memref.collapse_shape %2 [[0, 1]] : memref<4x32xf32> into memref<128xf32>
-# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<128xf32>
-# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:     %c128 = arith.constant 128 : index
-# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c128 step %c1_3 iter_args(%arg4 = %alloca_1) -> (memref<128xf32>) {
-# CHECK-NEXT:       %subview = memref.subview %collapse_shape[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel"]} ins(%subview, %cst_0 : memref<1xf32, strided<[1], offset: ?>>, f32) outs(%subview_4 : memref<1xf32, strided<[1], offset: ?>>) attrs =  {__xtc_id_relu_} {
-# CHECK-NEXT:       ^bb0(%in: f32, %in_6: f32, %out: f32):
-# CHECK-NEXT:         %4 = arith.maximumf %in, %in_6 : f32
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %cst : memref<1x32xf32, strided<[32, 1], offset: ?>>, f32) outs(%subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_2: f32, %out: f32):
+# CHECK-NEXT:         %4 = arith.maximumf %in, %in_2 : f32
 # CHECK-NEXT:         linalg.yield %4 : f32
 # CHECK-NEXT:       }
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3] [1] [1] : memref<128xf32> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       memref.copy %subview_4, %subview_5 : memref<1xf32, strided<[1], offset: ?>> to memref<1xf32, strided<[1], offset: ?>>
-# CHECK-NEXT:       scf.yield %arg4 : memref<128xf32>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %subview_0, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %expand_shape = memref.expand_shape %3 [[0, 1]] output_shape [4, 32] : memref<128xf32> into memref<4x32xf32>
-# CHECK-NEXT:     memref.copy %expand_shape, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     memref.copy %3, %arg2 : memref<4x32xf32> to memref<4x32xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
index 255c463df..d9c94661b 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
@@ -426,170 +426,100 @@
 # CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c5 = arith.constant 5 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c12 = arith.constant 12 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) {
-# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:       %c12 = arith.constant 12 : index
-# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_9 to %c12 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:         %c12_14 = arith.constant 12 : index
-# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_13 to %c12_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
-# CHECK-NEXT:           %c3 = arith.constant 3 : index
-# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c3 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:             %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           } {"./c"}
-# CHECK-NEXT:           %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %5, %subview_20 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %4, %subview_16 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %3, %subview_11 : memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x12x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) {
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         } {"./c"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x12x12x3xf32>
-# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     } {"./h"}
 # CHECK-NEXT:     %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) {
-# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:       %c4 = arith.constant 4 : index
-# CHECK-NEXT:       %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_9 to %c4 step %c1_10 iter_args(%arg6 = %subview_8) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_12 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_14 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_13 to %c4_14 step %c1_15 iter_args(%arg8 = %subview_12) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_17 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
-# CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %subview_17) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_21 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%subview_21 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
-# CHECK-NEXT:             %subview_22 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %subview_21, %subview_22 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_20 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %5, %subview_20 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %4, %subview_16 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_11 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %3, %subview_11 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_conv_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
-# CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
-# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
-# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) {
-# CHECK-NEXT:       %subview_8 = memref.subview %0[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:       %c4 = arith.constant 4 : index
-# CHECK-NEXT:       %c1_12 = arith.constant 1 : index
-# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_11 to %c4 step %c1_12 iter_args(%arg6 = %subview_10) -> (memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
-# CHECK-NEXT:         %subview_14 = memref.subview %subview_8[0, %4, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         %subview_15 = memref.subview %subview_9[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:         %subview_16 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:         %5 = scf.for %arg7 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg8 = %subview_16) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
-# CHECK-NEXT:           %subview_21 = memref.subview %subview_14[0, 0, %6, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           %subview_22 = memref.subview %subview_15[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>>
-# CHECK-NEXT:           %subview_23 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           %c0_24 = arith.constant 0 : index
-# CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_25 = arith.constant 1 : index
-# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_24 to %c16 step %c1_25 iter_args(%arg10 = %subview_23) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:             %subview_27 = memref.subview %subview_21[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:             %subview_28 = memref.subview %subview_22[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32, strided<[240, 48, 16, 1]>> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:             %subview_29 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             %c0_30 = arith.constant 0 : index
-# CHECK-NEXT:             %c5 = arith.constant 5 : index
-# CHECK-NEXT:             %c1_31 = arith.constant 1 : index
-# CHECK-NEXT:             %8 = scf.for %arg11 = %c0_30 to %c5 step %c1_31 iter_args(%arg12 = %subview_29) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:               %subview_33 = memref.subview %subview_27[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:               %subview_34 = memref.subview %subview_28[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:               %subview_35 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:               %c0_36 = arith.constant 0 : index
-# CHECK-NEXT:               %c5_37 = arith.constant 5 : index
-# CHECK-NEXT:               %c1_38 = arith.constant 1 : index
-# CHECK-NEXT:               %9 = scf.for %arg13 = %c0_36 to %c5_37 step %c1_38 iter_args(%arg14 = %subview_35) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:                 %subview_40 = memref.subview %subview_33[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_41 = memref.subview %subview_34[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %subview_42 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                 %c0_43 = arith.constant 0 : index
-# CHECK-NEXT:                 %c3 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_44 = arith.constant 1 : index
-# CHECK-NEXT:                 %10 = scf.for %arg15 = %c0_43 to %c3 step %c1_44 iter_args(%arg16 = %subview_42) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:                   %subview_46 = memref.subview %subview_40[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_47 = memref.subview %subview_41[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                   %subview_48 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                   linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_47 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%subview_48 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_50: f32, %out: f32):
-# CHECK-NEXT:                     %11 = arith.mulf %in, %in_50 : f32
-# CHECK-NEXT:                     %12 = arith.addf %out, %11 : f32
-# CHECK-NEXT:                     linalg.yield %12 : f32
-# CHECK-NEXT:                   }
-# CHECK-NEXT:                   %subview_49 = memref.subview %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                   memref.copy %subview_48, %subview_49 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                   scf.yield %arg16 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                 } {"./c"}
-# CHECK-NEXT:                 %subview_45 = memref.subview %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:                 memref.copy %10, %subview_45 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %subview_0 = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %3 = affine.apply #map(%arg3)
+# CHECK-NEXT:       %subview_1 = memref.subview %subview_0[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %5 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_7 = memref.subview %arg1[0, 0, 0, %arg7] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c5 step %c1 iter_args(%arg10 = %subview_8) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_10 = memref.subview %subview_4[0, %arg9, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_11 = memref.subview %subview_7[%arg9, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c5 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_12 = memref.subview %subview_10[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_13 = memref.subview %subview_11[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_16: f32, %out: f32):
+# CHECK-NEXT:                   %10 = arith.mulf %in, %in_16 : f32
+# CHECK-NEXT:                   %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                   linalg.yield %11 : f32
+# CHECK-NEXT:                 }
 # CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:               } {"./s"}
-# CHECK-NEXT:               %subview_39 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:               memref.copy %9, %subview_39 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             } {"./r"}
-# CHECK-NEXT:             %subview_32 = memref.subview %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             memref.copy %8, %subview_32 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:             scf.yield %arg10 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %subview_26 = memref.subview %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %7, %subview_26 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_20 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %subview_13 = memref.subview %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %3, %subview_13 : memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x4x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               } {"./c"}
+# CHECK-NEXT:               scf.yield %9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %8 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_9 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %7, %subview_9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %6, %subview_6 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_3 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
-# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     } {"./h"}
 # CHECK-NEXT:     memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
index b05c8a8d7..e882104ab 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
@@ -286,109 +286,78 @@
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
-# CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_3 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0_2 to %c16_3 step %c1_4 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst_1 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_1) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_5 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg1, %subview_5 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_6 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %cst_7 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
-# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %alloca_6) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_15 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_17 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %subview_15) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_20 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_7 : f32) outs(%subview_20 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_21 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_20, %subview_21 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %subview_2 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_19 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_19 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_12 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_11 to %c16_12 step %c1_13 iter_args(%arg4 = %2) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_15 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %subview_16 = memref.subview %1[0, 0] [16, 16] [1, 1] : memref<16x16xf32> to memref<16x16xf32, strided<[16, 1]>>
-# CHECK-NEXT:       %subview_17 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %c0_18 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_19 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0_18 to %c16_19 step %c1_20 iter_args(%arg6 = %subview_17) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_22 = memref.subview %subview_15[0, 0] [1, 16] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %subview_23 = memref.subview %subview_16[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32, strided<[16, 1]>> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %subview_24 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %c0_25 = arith.constant 0 : index
-# CHECK-NEXT:         %c16_26 = arith.constant 16 : index
-# CHECK-NEXT:         %c1_27 = arith.constant 1 : index
-# CHECK-NEXT:         %5 = scf.for %arg7 = %c0_25 to %c16_26 step %c1_27 iter_args(%arg8 = %subview_24) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_29 = memref.subview %subview_22[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           %subview_30 = memref.subview %subview_23[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           %subview_31 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_29, %subview_30 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%subview_31 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:           %subview_32 = memref.subview %arg8[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_31, %subview_32 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_4 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_5) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_7 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_8) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_10 = memref.subview %subview_4[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_11 = memref.subview %subview_7[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_10, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %subview_28 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_28 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_21 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_21 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_6 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_14 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %subview_14, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     %subview_3 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview_3, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
index 7a9eb3442..ac4cee48b 100644
--- a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -299,113 +299,77 @@
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
-# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
-# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0 = arith.constant 0 : index
-# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
 # CHECK-NEXT:     %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
-# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
-# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %0) -> (memref<4x32xf32>) {
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (memref<4x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %arg1[0, 0] [512, 32] [1, 1] : memref<512x32xf32> to memref<512x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_15 = memref.subview %subview[0, 0] [1, 512] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x512xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:         %subview_16 = memref.subview %subview_9[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32, strided<[32, 1]>> to memref<512x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %c0_18 = arith.constant 0 : index
-# CHECK-NEXT:         %c512 = arith.constant 512 : index
-# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
-# CHECK-NEXT:         %5 = scf.for %arg8 = %c0_18 to %c512 step %c1_19 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_21 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
-# CHECK-NEXT:           %subview_22 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_23 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_21, %subview_22 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_23 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:           %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_23, %subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg1[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0 to %c512 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %subview_20 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_20 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
-# CHECK-NEXT:     %c32 = arith.constant 32 : index
-# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
-# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) {
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_13 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%subview_13 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_14 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_13, %subview_14 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_12 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_12 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
-# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
-# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %2) -> (memref<32x32xf32>) {
+# CHECK-NEXT:     %3 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %2) -> (memref<32x32xf32>) {
 # CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:       %subview_9 = memref.subview %1[0, 0] [4, 32] [1, 1] : memref<4x32xf32> to memref<4x32xf32, strided<[32, 1]>>
-# CHECK-NEXT:       %subview_10 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
-# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
-# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
-# CHECK-NEXT:       %4 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %subview_10) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_15 = memref.subview %subview[0, 0] [1, 4] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x4xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:         %subview_16 = memref.subview %subview_9[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32, strided<[32, 1]>> to memref<4x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %subview_17 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         %c0_18 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_19 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_20 = arith.constant 1 : index
-# CHECK-NEXT:         %5 = scf.for %arg8 = %c0_18 to %c4_19 step %c1_20 iter_args(%arg9 = %subview_17) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_22 = memref.subview %subview_15[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
-# CHECK-NEXT:           %subview_23 = memref.subview %subview_16[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           %subview_24 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_22, %subview_23 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%subview_24 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
-# CHECK-NEXT:           %subview_25 = memref.subview %arg9[0, 0] [1, 1] [1, 1] : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_24, %subview_25 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %1[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32> to memref<4x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
 # CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %subview_21 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_21 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_14 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_14 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32>

From 036a18e79f61e8cfaa0b90f78f542a90f213273b Mon Sep 17 00:00:00 2001
From: Liam Semeria <liamsemeria@gmail.com>
Date: Tue, 24 Feb 2026 12:12:30 +0100
Subject: [PATCH 13/14] tensor-dialect: removed macos workaround

---
 src/xtc/backends/mlir/MlirCompilerPasses.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index 804adce6b..08835ade2 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -26,7 +26,6 @@
     OpResult,
 )
 from mlir.passmanager import PassManager
-import platform
 
 # Import SDist if available
 try:
@@ -557,10 +556,8 @@ def apply_bufferization_passes(mlir_program: RawMlirProgram):
     bufferize_options = [
         "bufferize-function-boundaries=1",
         "function-boundary-type-conversion=identity-layout-map",
+        "buffer-alignment=256",
     ]
-    # TODO: below is needed until macos mlir is updated
-    if platform.system() != "Darwin":
-        bufferize_options.append("buffer-alignment=256")
     apply_passes.run(
         [
             "canonicalize",

From 0bc7d2ab9a1986f212ddd77e41db8853602d0c46 Mon Sep 17 00:00:00 2001
From: Liam Semeria <liamsemeria@gmail.com>
Date: Wed, 4 Mar 2026 14:51:35 +0100
Subject: [PATCH 14/14] tensor-dialect: updated to tensor.pad, doesn't work for
 c :(

---
 src/xtc/backends/mlir/MlirOps.py              |  44 +-
 .../test_pad_conv2d_mlir_tensor.py            | 923 +++++++++++++-----
 .../test_pad_matmul_unpad_mlir_tensor.py      | 759 ++++++++++----
 3 files changed, 1232 insertions(+), 494 deletions(-)

diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index 836f7d2e7..48e0a2f19 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -11,6 +11,7 @@
 from xdsl.dialects.builtin import (
     MemRefType,
     TensorType,
+    IndexType,
     f32,
     f64,
     i64,
@@ -529,19 +530,25 @@ def generate_op(
         dims_value = list(self.args[:-1])
         padding = self.attrs["padding"]
         constant_value = self.attrs["constant_value"]
+        lows = [0] * len(dims_value)
+        highs = [0] * len(dims_value)
         if isinstance(padding, dict):
             dims_value_before_pad = list(dims_value)
             for i, pad_value in padding.items():
                 dims_value_before_pad[i] -= sum(pad_value)
+                lows[i] = pad_value[0]
+                highs[i] = pad_value[1]
         else:
             dims_value_before_pad = [
                 dim_value - sum(padding) for dim_value in dims_value
             ]
+            lows = [padding[0] for d in dims_value]
+            highs = [padding[1] for d in dims_value]
         elt_type = {"float32": f32, "float64": f64}[dtype]
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape)
+                MemRefType(elt_type, shape)  # TODO: should this be op_type?
                 for shape in [dims_value_before_pad, dims_value]
             ]
             block = Block(arg_types=ops_types)
@@ -560,20 +567,28 @@ def generate_op(
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size))
             result = (args[1].type,) if using_tensors else ()
-            fill = linalg.FillOp(
-                res=result,
-                inputs=(cst0.results[0],),
-                outputs=(args[1],),
-            )
+            fill_node_id = f"{self.name}_0"
             if using_tensors:
-                copy = tensor.InsertSliceOp.from_static_parameters(
+                fill = None
+                block_in = Block(arg_types=[IndexType()] * len(dims_value))
+                with ImplicitBuilder(block_in):
+                    tensor.YieldOp(cst0)
+                copy = tensor.PadOp(
                     source=args[0],
-                    dest=fill.results[0],
-                    offsets=offsets,
-                    sizes=sizes,
-                    strides=strides,
+                    region=Region([block_in]),
+                    low=[],
+                    high=[],
+                    nofold=UnitAttr(),
+                    result_type=TensorType(elt_type, dims_value),
+                    static_low=lows,
+                    static_high=highs,
                 )
             else:
+                fill = linalg.FillOp(
+                    res=result,
+                    inputs=(cst0.results[0],),
+                    outputs=(args[1],),
+                )
                 subview = memref.SubviewOp.from_static_parameters(
                     source=args[1],
                     source_type=args[1].type,  # type: ignore
@@ -586,14 +601,13 @@ def generate_op(
                     outputs=[subview.result],
                     res=result,
                 )
-        fill_node_id = f"{self.name}_0"
-        fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
+                fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
         copy_node_id = f"{self.name}"
         copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
-                fill_node_id: fill,
-                copy_node_id: None if using_tensors else copy,
+                **({fill_node_id: fill} if fill else {}),
+                copy_node_id: copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
index d9c94661b..2ebcefb39 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
@@ -41,18 +41,20 @@
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%0 : tensor<1x12x12x3xf32>) -> tensor<1x12x12x3xf32>
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %padded = tensor.pad %arg0 nofold low[0, 2, 2, 0] high[0, 2, 2, 0] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
+# CHECK-NEXT:       tensor.yield %cst : f32
+# CHECK-NEXT:     } {__xtc_id_pad_} : tensor<1x8x8x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%2 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
-# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%inserted_slice, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%3 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     %2 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%1 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%padded, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%2 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
 # CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
-# CHECK-NEXT:       %5 = arith.mulf %in, %in_1 : f32
-# CHECK-NEXT:       %6 = arith.addf %out, %5 : f32
-# CHECK-NEXT:       linalg.yield %6 : f32
+# CHECK-NEXT:       %4 = arith.mulf %in, %in_1 : f32
+# CHECK-NEXT:       %5 = arith.addf %out, %4 : f32
+# CHECK-NEXT:       linalg.yield %5 : f32
 # CHECK-NEXT:     } -> tensor<1x4x4x16xf32>
-# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
@@ -60,7 +62,7 @@
 # CHECK-NEXT:     transform.yield 
 # CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -98,332 +100,631 @@
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)>
+# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)>
+# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)>
+# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) {
 # CHECK-NEXT:       %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:       %c12 = arith.constant 12 : index
-# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) {
-# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) {
-# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32>
-# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:           %c3 = arith.constant 3 : index
-# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) {
-# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32>
-# CHECK-NEXT:           } {"./c"}
-# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %6 = arith.cmpi eq, %c8, %c0_8 : index
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c8_10 = arith.constant 8 : index
+# CHECK-NEXT:       %7 = arith.cmpi eq, %c8_10, %c0_9 : index
+# CHECK-NEXT:       %8 = arith.ori %7, %6 : i1
+# CHECK-NEXT:       %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32>
+# CHECK-NEXT:         %10 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c12 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:           %12 = affine.apply #map(%arg5)
+# CHECK-NEXT:           %13 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %14 = affine.apply #map2(%arg5)
+# CHECK-NEXT:           %15 = affine.max #map3(%arg5)
+# CHECK-NEXT:           %16 = affine.min #map4(%15)
+# CHECK-NEXT:           %17 = affine.apply #map5(%13)
+# CHECK-NEXT:           %18 = affine.apply #map6(%16)
+# CHECK-NEXT:           %19 = affine.min #map7(%16, %13)
+# CHECK-NEXT:           %20 = affine.max #map8(%19)
+# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0_13 : index
+# CHECK-NEXT:           %22 = affine.apply #map5(%20)
+# CHECK-NEXT:           %23 = affine.apply #map9(%13, %20)
+# CHECK-NEXT:           %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:           %c8_15 = arith.constant 8 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %c8_15, %c0_14 : index
+# CHECK-NEXT:           %25 = arith.ori %24, %21 : i1
+# CHECK-NEXT:           %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32>
+# CHECK-NEXT:             %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:             %27 = tensor.empty() : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:             %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:             %c12_21 = arith.constant 12 : index
+# CHECK-NEXT:             %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:             %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:               %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:               %29 = affine.max #map10(%13)
+# CHECK-NEXT:               %30 = affine.apply #map11(%13)
+# CHECK-NEXT:               %31 = affine.max #map12(%13)
+# CHECK-NEXT:               %32 = affine.min #map13(%31, %20)
+# CHECK-NEXT:               %33 = affine.apply #map5(%29)
+# CHECK-NEXT:               %34 = affine.apply #map14(%20, %32)
+# CHECK-NEXT:               %35 = affine.min #map15(%20, %32, %29)
+# CHECK-NEXT:               %36 = affine.max #map8(%35)
+# CHECK-NEXT:               %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:               %37 = arith.cmpi eq, %36, %c0_24 : index
+# CHECK-NEXT:               %38 = affine.apply #map5(%36)
+# CHECK-NEXT:               %39 = affine.apply #map9(%29, %36)
+# CHECK-NEXT:               %40 = affine.apply #map(%arg7)
+# CHECK-NEXT:               %41 = affine.max #map1(%arg7)
+# CHECK-NEXT:               %42 = affine.apply #map2(%arg7)
+# CHECK-NEXT:               %43 = affine.max #map3(%arg7)
+# CHECK-NEXT:               %44 = affine.min #map4(%43)
+# CHECK-NEXT:               %45 = affine.apply #map5(%41)
+# CHECK-NEXT:               %46 = affine.apply #map6(%44)
+# CHECK-NEXT:               %47 = affine.min #map7(%44, %41)
+# CHECK-NEXT:               %48 = affine.max #map8(%47)
+# CHECK-NEXT:               %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:               %49 = arith.cmpi eq, %48, %c0_25 : index
+# CHECK-NEXT:               %50 = arith.ori %49, %37 : i1
+# CHECK-NEXT:               %51 = affine.apply #map5(%48)
+# CHECK-NEXT:               %52 = affine.apply #map9(%41, %48)
+# CHECK-NEXT:               %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                 %generated = tensor.generate  {
+# CHECK-NEXT:                 ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+# CHECK-NEXT:                   tensor.yield %cst : f32
+# CHECK-NEXT:                 } : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %generated : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32>
+# CHECK-NEXT:                 %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %54 = tensor.empty() : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2_30 = arith.constant 2 : index
+# CHECK-NEXT:                 %c0_31 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:                 %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                   %c1_34 = arith.constant 1 : index
+# CHECK-NEXT:                   %56 = affine.max #map10(%29)
+# CHECK-NEXT:                   %57 = affine.apply #map11(%29)
+# CHECK-NEXT:                   %58 = affine.max #map12(%29)
+# CHECK-NEXT:                   %59 = affine.min #map13(%58, %36)
+# CHECK-NEXT:                   %60 = affine.apply #map5(%56)
+# CHECK-NEXT:                   %61 = affine.apply #map14(%36, %59)
+# CHECK-NEXT:                   %62 = affine.min #map15(%36, %59, %56)
+# CHECK-NEXT:                   %63 = affine.max #map8(%62)
+# CHECK-NEXT:                   %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:                   %64 = arith.cmpi eq, %63, %c0_35 : index
+# CHECK-NEXT:                   %65 = affine.apply #map5(%63)
+# CHECK-NEXT:                   %66 = affine.apply #map9(%56, %63)
+# CHECK-NEXT:                   %c2_36 = arith.constant 2 : index
+# CHECK-NEXT:                   %67 = affine.max #map10(%41)
+# CHECK-NEXT:                   %68 = affine.apply #map11(%41)
+# CHECK-NEXT:                   %69 = affine.max #map12(%41)
+# CHECK-NEXT:                   %70 = affine.min #map13(%69, %48)
+# CHECK-NEXT:                   %71 = affine.apply #map5(%67)
+# CHECK-NEXT:                   %72 = affine.apply #map14(%48, %70)
+# CHECK-NEXT:                   %73 = affine.min #map15(%48, %70, %67)
+# CHECK-NEXT:                   %74 = affine.max #map8(%73)
+# CHECK-NEXT:                   %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                   %75 = arith.cmpi eq, %74, %c0_37 : index
+# CHECK-NEXT:                   %76 = arith.ori %75, %64 : i1
+# CHECK-NEXT:                   %77 = affine.apply #map5(%74)
+# CHECK-NEXT:                   %78 = affine.apply #map9(%67, %74)
+# CHECK-NEXT:                   %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                     %generated = tensor.generate  {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %generated : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   } else {
+# CHECK-NEXT:                     %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %cast_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                   %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %cast_33 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             } {"./w"}
+# CHECK-NEXT:             %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %cast : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         } {"./h"}
+# CHECK-NEXT:         scf.yield %11 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_2 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
 # CHECK-NEXT:       %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) {
-# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
-# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_12 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) {
-# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32>
 # CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:           %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32>
 # CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32>
 # CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
 # CHECK-NEXT:     %c0_5 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_6 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
 # CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
 # CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
 # CHECK-NEXT:       %c0_10 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:         %6 = affine.apply #map(%arg5)
-# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
-# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
-# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
-# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) {
-# CHECK-NEXT:           %8 = affine.apply #map(%arg7)
-# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
-# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
-# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
-# CHECK-NEXT:           %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %7 = affine.apply #map16(%arg5)
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_16 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:         %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %9 = affine.apply #map16(%arg7)
+# CHECK-NEXT:           %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:           %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) {
-# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
-# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
-# CHECK-NEXT:             %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:           %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_28 = arith.constant 0 : index
 # CHECK-NEXT:             %c5 = arith.constant 5 : index
-# CHECK-NEXT:             %c1_30 = arith.constant 1 : index
-# CHECK-NEXT:             %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
-# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
-# CHECK-NEXT:               %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:               %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:               %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:               %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:               %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
-# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
-# CHECK-NEXT:                 %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                 %c0_42 = arith.constant 0 : index
+# CHECK-NEXT:             %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:             %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_34 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_35 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:               %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_41 = arith.constant 0 : index
 # CHECK-NEXT:                 %c3 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_43 = arith.constant 1 : index
-# CHECK-NEXT:                 %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_49: f32, %out: f32):
-# CHECK-NEXT:                     %14 = arith.mulf %in, %in_49 : f32
-# CHECK-NEXT:                     %15 = arith.addf %out, %14 : f32
-# CHECK-NEXT:                     linalg.yield %15 : f32
+# CHECK-NEXT:                 %c1_42 = arith.constant 1 : index
+# CHECK-NEXT:                 %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_48: f32, %out: f32):
+# CHECK-NEXT:                     %15 = arith.mulf %in, %in_48 : f32
+# CHECK-NEXT:                     %16 = arith.addf %out, %15 : f32
+# CHECK-NEXT:                     linalg.yield %16 : f32
 # CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:                 } {"./c"}
-# CHECK-NEXT:                 %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:                 scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:               } {"./s"}
-# CHECK-NEXT:               %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:               scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:             } {"./r"}
-# CHECK-NEXT:             %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:             %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32>
 # CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32>
 # CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32>
 # CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)>
+# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)>
+# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)>
+# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_0 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x12x12x3xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) {
 # CHECK-NEXT:       %c0_8 = arith.constant 0 : index
-# CHECK-NEXT:       %c12 = arith.constant 12 : index
-# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c12 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x12x12x3xf32>) {
-# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x1x12x3xf32>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c12_13 = arith.constant 12 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c12_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x12x3xf32>) {
-# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> to tensor<1x1x1x3xf32>
-# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
-# CHECK-NEXT:           %c3 = arith.constant 3 : index
-# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c3 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x3xf32>) {
-# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x3xf32>
-# CHECK-NEXT:           } {"./c"}
-# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x12x3xf32>
-# CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x12x12x3xf32>
-# CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %6 = arith.cmpi eq, %c8, %c0_8 : index
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c8_10 = arith.constant 8 : index
+# CHECK-NEXT:       %7 = arith.cmpi eq, %c8_10, %c0_9 : index
+# CHECK-NEXT:       %8 = arith.ori %7, %6 : i1
+# CHECK-NEXT:       %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32>
+# CHECK-NEXT:         %10 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c12 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:           %12 = affine.apply #map(%arg5)
+# CHECK-NEXT:           %13 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %14 = affine.apply #map2(%arg5)
+# CHECK-NEXT:           %15 = affine.max #map3(%arg5)
+# CHECK-NEXT:           %16 = affine.min #map4(%15)
+# CHECK-NEXT:           %17 = affine.apply #map5(%13)
+# CHECK-NEXT:           %18 = affine.apply #map6(%16)
+# CHECK-NEXT:           %19 = affine.min #map7(%16, %13)
+# CHECK-NEXT:           %20 = affine.max #map8(%19)
+# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0_13 : index
+# CHECK-NEXT:           %22 = affine.apply #map5(%20)
+# CHECK-NEXT:           %23 = affine.apply #map9(%13, %20)
+# CHECK-NEXT:           %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:           %c8_15 = arith.constant 8 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %c8_15, %c0_14 : index
+# CHECK-NEXT:           %25 = arith.ori %24, %21 : i1
+# CHECK-NEXT:           %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32>
+# CHECK-NEXT:             %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:             %27 = tensor.empty() : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:             %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:             %c12_21 = arith.constant 12 : index
+# CHECK-NEXT:             %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:             %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:               %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:               %29 = affine.max #map10(%13)
+# CHECK-NEXT:               %30 = affine.apply #map11(%13)
+# CHECK-NEXT:               %31 = affine.max #map12(%13)
+# CHECK-NEXT:               %32 = affine.min #map13(%31, %20)
+# CHECK-NEXT:               %33 = affine.apply #map5(%29)
+# CHECK-NEXT:               %34 = affine.apply #map14(%20, %32)
+# CHECK-NEXT:               %35 = affine.min #map15(%20, %32, %29)
+# CHECK-NEXT:               %36 = affine.max #map8(%35)
+# CHECK-NEXT:               %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:               %37 = arith.cmpi eq, %36, %c0_24 : index
+# CHECK-NEXT:               %38 = affine.apply #map5(%36)
+# CHECK-NEXT:               %39 = affine.apply #map9(%29, %36)
+# CHECK-NEXT:               %40 = affine.apply #map(%arg7)
+# CHECK-NEXT:               %41 = affine.max #map1(%arg7)
+# CHECK-NEXT:               %42 = affine.apply #map2(%arg7)
+# CHECK-NEXT:               %43 = affine.max #map3(%arg7)
+# CHECK-NEXT:               %44 = affine.min #map4(%43)
+# CHECK-NEXT:               %45 = affine.apply #map5(%41)
+# CHECK-NEXT:               %46 = affine.apply #map6(%44)
+# CHECK-NEXT:               %47 = affine.min #map7(%44, %41)
+# CHECK-NEXT:               %48 = affine.max #map8(%47)
+# CHECK-NEXT:               %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:               %49 = arith.cmpi eq, %48, %c0_25 : index
+# CHECK-NEXT:               %50 = arith.ori %49, %37 : i1
+# CHECK-NEXT:               %51 = affine.apply #map5(%48)
+# CHECK-NEXT:               %52 = affine.apply #map9(%41, %48)
+# CHECK-NEXT:               %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                 %generated = tensor.generate  {
+# CHECK-NEXT:                 ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+# CHECK-NEXT:                   tensor.yield %cst : f32
+# CHECK-NEXT:                 } : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %generated : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32>
+# CHECK-NEXT:                 %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %54 = tensor.empty() : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2_30 = arith.constant 2 : index
+# CHECK-NEXT:                 %c0_31 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:                 %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                   %c1_34 = arith.constant 1 : index
+# CHECK-NEXT:                   %56 = affine.max #map10(%29)
+# CHECK-NEXT:                   %57 = affine.apply #map11(%29)
+# CHECK-NEXT:                   %58 = affine.max #map12(%29)
+# CHECK-NEXT:                   %59 = affine.min #map13(%58, %36)
+# CHECK-NEXT:                   %60 = affine.apply #map5(%56)
+# CHECK-NEXT:                   %61 = affine.apply #map14(%36, %59)
+# CHECK-NEXT:                   %62 = affine.min #map15(%36, %59, %56)
+# CHECK-NEXT:                   %63 = affine.max #map8(%62)
+# CHECK-NEXT:                   %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:                   %64 = arith.cmpi eq, %63, %c0_35 : index
+# CHECK-NEXT:                   %65 = affine.apply #map5(%63)
+# CHECK-NEXT:                   %66 = affine.apply #map9(%56, %63)
+# CHECK-NEXT:                   %c2_36 = arith.constant 2 : index
+# CHECK-NEXT:                   %67 = affine.max #map10(%41)
+# CHECK-NEXT:                   %68 = affine.apply #map11(%41)
+# CHECK-NEXT:                   %69 = affine.max #map12(%41)
+# CHECK-NEXT:                   %70 = affine.min #map13(%69, %48)
+# CHECK-NEXT:                   %71 = affine.apply #map5(%67)
+# CHECK-NEXT:                   %72 = affine.apply #map14(%48, %70)
+# CHECK-NEXT:                   %73 = affine.min #map15(%48, %70, %67)
+# CHECK-NEXT:                   %74 = affine.max #map8(%73)
+# CHECK-NEXT:                   %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                   %75 = arith.cmpi eq, %74, %c0_37 : index
+# CHECK-NEXT:                   %76 = arith.ori %75, %64 : i1
+# CHECK-NEXT:                   %77 = affine.apply #map5(%74)
+# CHECK-NEXT:                   %78 = affine.apply #map9(%67, %74)
+# CHECK-NEXT:                   %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                     %generated = tensor.generate  {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %generated : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   } else {
+# CHECK-NEXT:                     %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %cast_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                   %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %cast_33 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             } {"./w"}
+# CHECK-NEXT:             %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %cast : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         } {"./h"}
+# CHECK-NEXT:         scf.yield %11 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x12x12x3xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] {__xtc_id_pad_} : tensor<1x8x8x3xf32> into tensor<1x12x12x3xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %c0_2 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_4 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %2) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
 # CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
 # CHECK-NEXT:       %c0_8 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_9 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_11 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
-# CHECK-NEXT:         %c0_12 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_13 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_14 = arith.constant 1 : index
-# CHECK-NEXT:         %6 = scf.for %arg7 = %c0_12 to %c4_13 step %c1_14 iter_args(%arg8 = %extracted_slice_11) -> (tensor<1x1x4x16xf32>) {
-# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
-# CHECK-NEXT:           %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_12 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:           %7 = scf.for %arg9 = %c0_17 to %c16 step %c1_18 iter_args(%arg10 = %extracted_slice_16) -> (tensor<1x1x1x16xf32>) {
-# CHECK-NEXT:             %extracted_slice_20 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %8 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_20 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %inserted_slice_21 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_21 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32>
 # CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %inserted_slice_19 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_19 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:           %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32>
 # CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_15 = tensor.insert_slice %6 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_15 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32>
 # CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_10 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_10 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
 # CHECK-NEXT:     %c0_5 = arith.constant 0 : index
 # CHECK-NEXT:     %c1_6 = arith.constant 1 : index
 # CHECK-NEXT:     %c1_7 = arith.constant 1 : index
-# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %inserted_slice[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
 # CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
 # CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
 # CHECK-NEXT:       %c0_10 = arith.constant 0 : index
 # CHECK-NEXT:       %c4 = arith.constant 4 : index
 # CHECK-NEXT:       %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:       %5 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
-# CHECK-NEXT:         %6 = affine.apply #map(%arg5)
-# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice[0, %6, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
-# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
-# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
-# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
-# CHECK-NEXT:         %c4_17 = arith.constant 4 : index
-# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
-# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_16 to %c4_17 step %c1_18 iter_args(%arg8 = %extracted_slice_15) -> (tensor<1x1x4x16xf32>) {
-# CHECK-NEXT:           %8 = affine.apply #map(%arg7)
-# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, %8, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
-# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
-# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
-# CHECK-NEXT:           %c0_23 = arith.constant 0 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %7 = affine.apply #map16(%arg5)
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_16 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:         %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %9 = affine.apply #map16(%arg7)
+# CHECK-NEXT:           %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
 # CHECK-NEXT:           %c16 = arith.constant 16 : index
-# CHECK-NEXT:           %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:           %9 = scf.for %arg9 = %c0_23 to %c16 step %c1_24 iter_args(%arg10 = %extracted_slice_22) -> (tensor<1x1x1x16xf32>) {
-# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
-# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
-# CHECK-NEXT:             %extracted_slice_28 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:             %c0_29 = arith.constant 0 : index
+# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:           %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_28 = arith.constant 0 : index
 # CHECK-NEXT:             %c5 = arith.constant 5 : index
-# CHECK-NEXT:             %c1_30 = arith.constant 1 : index
-# CHECK-NEXT:             %10 = scf.for %arg11 = %c0_29 to %c5 step %c1_30 iter_args(%arg12 = %extracted_slice_28) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
-# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %extracted_slice_27[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
-# CHECK-NEXT:               %extracted_slice_34 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:               %c0_35 = arith.constant 0 : index
-# CHECK-NEXT:               %c5_36 = arith.constant 5 : index
-# CHECK-NEXT:               %c1_37 = arith.constant 1 : index
-# CHECK-NEXT:               %11 = scf.for %arg13 = %c0_35 to %c5_36 step %c1_37 iter_args(%arg14 = %extracted_slice_34) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
-# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %extracted_slice_33[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
-# CHECK-NEXT:                 %extracted_slice_41 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                 %c0_42 = arith.constant 0 : index
+# CHECK-NEXT:             %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:             %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_34 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_35 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:               %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_41 = arith.constant 0 : index
 # CHECK-NEXT:                 %c3 = arith.constant 3 : index
-# CHECK-NEXT:                 %c1_43 = arith.constant 1 : index
-# CHECK-NEXT:                 %12 = scf.for %arg15 = %c0_42 to %c3 step %c1_43 iter_args(%arg16 = %extracted_slice_41) -> (tensor<1x1x1x1xf32>) {
-# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %extracted_slice_40[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %extracted_slice_47 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_45, %extracted_slice_46 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_47 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
-# CHECK-NEXT:                   ^bb0(%in: f32, %in_49: f32, %out: f32):
-# CHECK-NEXT:                     %14 = arith.mulf %in, %in_49 : f32
-# CHECK-NEXT:                     %15 = arith.addf %out, %14 : f32
-# CHECK-NEXT:                     linalg.yield %15 : f32
+# CHECK-NEXT:                 %c1_42 = arith.constant 1 : index
+# CHECK-NEXT:                 %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_48: f32, %out: f32):
+# CHECK-NEXT:                     %15 = arith.mulf %in, %in_48 : f32
+# CHECK-NEXT:                     %16 = arith.addf %out, %15 : f32
+# CHECK-NEXT:                     linalg.yield %16 : f32
 # CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   %inserted_slice_48 = tensor.insert_slice %13 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:                   scf.yield %inserted_slice_48 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:                 } {"./c"}
-# CHECK-NEXT:                 %inserted_slice_44 = tensor.insert_slice %12 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:                 scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:               } {"./s"}
-# CHECK-NEXT:               %inserted_slice_38 = tensor.insert_slice %11 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
-# CHECK-NEXT:               scf.yield %inserted_slice_38 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32>
 # CHECK-NEXT:             } {"./r"}
-# CHECK-NEXT:             %inserted_slice_31 = tensor.insert_slice %10 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
-# CHECK-NEXT:             scf.yield %inserted_slice_31 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:             %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32>
 # CHECK-NEXT:           } {"./f"}
-# CHECK-NEXT:           %inserted_slice_25 = tensor.insert_slice %9 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_25 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32>
 # CHECK-NEXT:         } {"./w"}
-# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32>
 # CHECK-NEXT:       } {"./h"}
-# CHECK-NEXT:       %inserted_slice_12 = tensor.insert_slice %5 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_12 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./b"}
-# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
-# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
-# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
-# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
-# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2, 0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, d0 - 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (8, d0)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d1, d0)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1, d2) -> (-d2 + 1, d0 - d1)>
+# CHECK-NEXT: #map8 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map9 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %c5 = arith.constant 5 : index
@@ -435,27 +736,117 @@
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32>
 # CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) {
+# CHECK-NEXT:       %3 = affine.max #map(%arg3)
+# CHECK-NEXT:       %4 = affine.max #map1(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map2(%4)
+# CHECK-NEXT:       %6 = affine.min #map3(%5, %3)
+# CHECK-NEXT:       %7 = affine.max #map4(%6)
+# CHECK-NEXT:       %8 = arith.cmpi eq, %7, %c0 : index
+# CHECK-NEXT:       %9 = scf.if %8 -> (memref<1x1x12x3xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca : memref<1x1x12x3xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %10 = linalg.index 0 : index
+# CHECK-NEXT:             %11 = linalg.index 1 : index
+# CHECK-NEXT:             %12 = linalg.index 2 : index
+# CHECK-NEXT:             %13 = linalg.index 3 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca : memref<1x1x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg0[0, %5, 0, 0] [1, %7, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_3, %alloca_0 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32>
+# CHECK-NEXT:         %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32>
+# CHECK-NEXT:         %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32>
+# CHECK-NEXT:         %10 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %alloca_0) -> (memref<1x1x12x3xf32>) {
+# CHECK-NEXT:           %11 = affine.max #map5(%3)
+# CHECK-NEXT:           %12 = affine.min #map6(%11, %7)
+# CHECK-NEXT:           %13 = affine.min #map7(%7, %12, %3)
+# CHECK-NEXT:           %14 = affine.max #map4(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = affine.max #map(%arg5)
+# CHECK-NEXT:           %17 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %18 = affine.min #map2(%17)
+# CHECK-NEXT:           %19 = affine.min #map3(%18, %16)
+# CHECK-NEXT:           %20 = affine.max #map4(%19)
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0 : index
+# CHECK-NEXT:           %22 = arith.ori %21, %15 : i1
+# CHECK-NEXT:           %23 = scf.if %22 -> (memref<1x1x1x3xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_4 : memref<1x1x1x3xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %24 = linalg.index 0 : index
+# CHECK-NEXT:                 %25 = linalg.index 1 : index
+# CHECK-NEXT:                 %26 = linalg.index 2 : index
+# CHECK-NEXT:                 %27 = linalg.index 3 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_4 : memref<1x1x1x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_7 = memref.subview %subview_2[0, %12, %18, 0] [1, %14, %20, 3] [1, 1, 1, 1] : memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_8 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_8, %alloca_5 : memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32>
+# CHECK-NEXT:             %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32>
+# CHECK-NEXT:             %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32>
+# CHECK-NEXT:             %24 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %alloca_5) -> (memref<1x1x1x3xf32>) {
+# CHECK-NEXT:               %25 = affine.min #map6(%11, %14)
+# CHECK-NEXT:               %26 = affine.min #map7(%14, %25, %3)
+# CHECK-NEXT:               %27 = affine.max #map4(%26)
+# CHECK-NEXT:               %28 = arith.cmpi eq, %27, %c0 : index
+# CHECK-NEXT:               %29 = affine.apply #map8(%3, %27)
+# CHECK-NEXT:               %30 = affine.max #map5(%16)
+# CHECK-NEXT:               %31 = affine.min #map6(%30, %20)
+# CHECK-NEXT:               %32 = affine.min #map7(%20, %31, %16)
+# CHECK-NEXT:               %33 = affine.max #map4(%32)
+# CHECK-NEXT:               %34 = arith.cmpi eq, %33, %c0 : index
+# CHECK-NEXT:               %35 = arith.ori %34, %28 : i1
+# CHECK-NEXT:               %36 = affine.apply #map8(%16, %33)
+# CHECK-NEXT:               %37 = scf.if %35 -> (memref<1x1x1x1xf32>) {
+# CHECK-NEXT:                 linalg.map outs(%alloca_9 : memref<1x1x1x1xf32>)
+# CHECK-NEXT:                   () {
+# CHECK-NEXT:                     %38 = linalg.index 0 : index
+# CHECK-NEXT:                     %39 = linalg.index 1 : index
+# CHECK-NEXT:                     %40 = linalg.index 2 : index
+# CHECK-NEXT:                     %41 = linalg.index 3 : index
+# CHECK-NEXT:                     linalg.yield %cst : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 scf.yield %alloca_9 : memref<1x1x1x1xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %subview_12 = memref.subview %subview_7[0, %25, %31, %arg7] [1, %27, %33, 1] [1, 1, 1, 1] : memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.map outs(%alloca_10 : memref<1x1x1x1xf32>)
+# CHECK-NEXT:                   () {
+# CHECK-NEXT:                     %38 = linalg.index 0 : index
+# CHECK-NEXT:                     %39 = linalg.index 1 : index
+# CHECK-NEXT:                     %40 = linalg.index 2 : index
+# CHECK-NEXT:                     %41 = linalg.index 3 : index
+# CHECK-NEXT:                     linalg.yield %cst : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:                 %dim = memref.dim %subview_12, %c1_13 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %dim_14 = memref.dim %subview_12, %c2 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_15 = memref.subview %alloca_10[0, %3, %16, 0] [1, %dim, %dim_14, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_12, %subview_15 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>>
+# CHECK-NEXT:                 scf.yield %alloca_10 : memref<1x1x1x1xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_11 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %37, %subview_11 : memref<1x1x1x1xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg8 : memref<1x1x1x3xf32>
+# CHECK-NEXT:             } {"./c"}
+# CHECK-NEXT:             scf.yield %24 : memref<1x1x1x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %23, %subview_6 : memref<1x1x1x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x1x12x3xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         scf.yield %10 : memref<1x1x12x3xf32>
+# CHECK-NEXT:       }
 # CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           linalg.fill {__xtc_id_pad_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>)
-# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         } {"./c"}
-# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       } {"./w"}
-# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %9, %subview_1 : memref<1x1x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x12x12x3xf32>
 # CHECK-NEXT:     } {"./h"}
-# CHECK-NEXT:     %subview = memref.subview %0[0, 2, 2, 0] [1, 8, 8, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<1x8x8x3xf32> to memref<1x8x8x3xf32, strided<[432, 36, 3, 1], offset: 78>>
 # CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) {
 # CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
@@ -475,13 +866,13 @@
 # CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
 # CHECK-NEXT:     } {"./h"}
-# CHECK-NEXT:     %subview_0 = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>>
+# CHECK-NEXT:     %subview = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>>
 # CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) {
-# CHECK-NEXT:       %3 = affine.apply #map(%arg3)
-# CHECK-NEXT:       %subview_1 = memref.subview %subview_0[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %3 = affine.apply #map9(%arg3)
+# CHECK-NEXT:       %subview_1 = memref.subview %subview[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
-# CHECK-NEXT:         %5 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %5 = affine.apply #map9(%arg5)
 # CHECK-NEXT:         %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
 # CHECK-NEXT:         %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
@@ -496,7 +887,7 @@
 # CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
 # CHECK-NEXT:                 %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
 # CHECK-NEXT:                 %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
-# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map10, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
 # CHECK-NEXT:                 ^bb0(%in: f32, %in_16: f32, %out: f32):
 # CHECK-NEXT:                   %10 = arith.mulf %in, %in_16 : f32
 # CHECK-NEXT:                   %11 = arith.addf %out, %10 : f32
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
index e882104ab..c5e42160d 100644
--- a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
@@ -37,18 +37,22 @@
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%0 : tensor<16x16xf32>) -> tensor<16x16xf32>
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %padded = tensor.pad %arg0 nofold low[0, 0] high[2, 2] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index):
+# CHECK-NEXT:       tensor.yield %cst : f32
+# CHECK-NEXT:     } {__xtc_id_A_pad_} : tensor<14x14xf32> to tensor<16x16xf32>
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32>
-# CHECK-NEXT:     %inserted_slice_1 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %padded_1 = tensor.pad %arg1 nofold low[0, 0] high[2, 2] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index):
+# CHECK-NEXT:       tensor.yield %cst_0 : f32
+# CHECK-NEXT:     } {__xtc_id_B_pad_} : tensor<14x14xf32> to tensor<16x16xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst_2 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %5 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%4 : tensor<16x16xf32>) -> tensor<16x16xf32>
-# CHECK-NEXT:     %6 = linalg.matmul {__xtc_id_matmul_padded_} ins(%inserted_slice, %inserted_slice_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%5 : tensor<16x16xf32>) -> tensor<16x16xf32>
-# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
-# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %4 = linalg.matmul {__xtc_id_matmul_padded_} ins(%padded, %padded_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%3 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %5 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
 # CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
@@ -57,12 +61,12 @@
 # CHECK-NEXT:     transform.yield 
 # CHECK-NEXT:   }
 # CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_A_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
-# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_B_pad_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 # CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
 # CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -84,206 +88,437 @@
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } {__xtc_id_A_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %c0_1 = arith.constant 0 : index
 # CHECK-NEXT:     %c16_2 = arith.constant 16 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst_0 : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } {__xtc_id_B_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
-# CHECK-NEXT:     %cst_5 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_7 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
-# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:     %6 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_6 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_13 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_16 : tensor<1x16xf32>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_10 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:     %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
-# CHECK-NEXT:       %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_16 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:         %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
-# CHECK-NEXT:         %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %c0_22 = arith.constant 0 : index
-# CHECK-NEXT:         %c16_23 = arith.constant 16 : index
-# CHECK-NEXT:         %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:         %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) {
-# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:           %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_29 : tensor<1x1xf32>
+# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:     %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_15 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_16 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_21 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_27 : tensor<1x1xf32>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_25 : tensor<1x16xf32>
+# CHECK-NEXT:         %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_23 : tensor<1x16xf32>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_18 : tensor<16x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
-# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     %9 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
 # CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %c0 = arith.constant 0 : index
 # CHECK-NEXT:     %c16 = arith.constant 16 : index
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %0) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } {__xtc_id_A_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %inserted_slice = tensor.insert_slice %arg0 into %1[0, 0] [14, 14] [1, 1] {__xtc_id_A_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
 # CHECK-NEXT:     %c0_1 = arith.constant 0 : index
 # CHECK-NEXT:     %c16_2 = arith.constant 16 : index
 # CHECK-NEXT:     %c1_3 = arith.constant 1 : index
-# CHECK-NEXT:     %3 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %2) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_B_pad_0_} ins(%cst_0 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst_0 : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } {__xtc_id_B_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %inserted_slice_4 = tensor.insert_slice %arg1 into %3[0, 0] [14, 14] [1, 1] {__xtc_id_B_pad_} : tensor<14x14xf32> into tensor<16x16xf32>
-# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
-# CHECK-NEXT:     %cst_5 = arith.constant 0.000000e+00 : f32
-# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_7 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
-# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_6 to %c16_7 step %c1_8 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_13 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_14 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_15 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_13 to %c16_14 step %c1_15 iter_args(%arg6 = %extracted_slice_12) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %9 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_5 : f32) outs(%extracted_slice_17 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x16xf32>
+# CHECK-NEXT:     %6 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_6 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_13 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_16 : tensor<1x16xf32>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_16 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_16 : tensor<16x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %c0_9 = arith.constant 0 : index
-# CHECK-NEXT:     %c16_10 = arith.constant 16 : index
-# CHECK-NEXT:     %c1_11 = arith.constant 1 : index
-# CHECK-NEXT:     %6 = scf.for %arg3 = %c0_9 to %c16_10 step %c1_11 iter_args(%arg4 = %5) -> (tensor<16x16xf32>) {
-# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %inserted_slice[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %inserted_slice_4[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
-# CHECK-NEXT:       %extracted_slice_14 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:       %c0_15 = arith.constant 0 : index
-# CHECK-NEXT:       %c16_16 = arith.constant 16 : index
-# CHECK-NEXT:       %c1_17 = arith.constant 1 : index
-# CHECK-NEXT:       %8 = scf.for %arg5 = %c0_15 to %c16_16 step %c1_17 iter_args(%arg6 = %extracted_slice_14) -> (tensor<1x16xf32>) {
-# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
-# CHECK-NEXT:         %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
-# CHECK-NEXT:         %extracted_slice_21 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:         %c0_22 = arith.constant 0 : index
-# CHECK-NEXT:         %c16_23 = arith.constant 16 : index
-# CHECK-NEXT:         %c1_24 = arith.constant 1 : index
-# CHECK-NEXT:         %9 = scf.for %arg7 = %c0_22 to %c16_23 step %c1_24 iter_args(%arg8 = %extracted_slice_21) -> (tensor<1x1xf32>) {
-# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %extracted_slice_19[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_27 = tensor.extract_slice %extracted_slice_20[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %extracted_slice_28 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
-# CHECK-NEXT:           %10 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_26, %extracted_slice_27 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_28 : tensor<1x1xf32>) -> tensor<1x1xf32>
-# CHECK-NEXT:           %inserted_slice_29 = tensor.insert_slice %10 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
-# CHECK-NEXT:           scf.yield %inserted_slice_29 : tensor<1x1xf32>
+# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:     %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_15 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_16 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_21 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_27 : tensor<1x1xf32>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %inserted_slice_25 = tensor.insert_slice %9 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
-# CHECK-NEXT:         scf.yield %inserted_slice_25 : tensor<1x16xf32>
+# CHECK-NEXT:         %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_23 : tensor<1x16xf32>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %inserted_slice_18 = tensor.insert_slice %8 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
-# CHECK-NEXT:       scf.yield %inserted_slice_18 : tensor<16x16xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %7 = tensor.empty() : tensor<14x14xf32>
-# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %6[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     %9 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
 # CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 # CHECK-NEXT:  
 # CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (14, d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 0, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (1, d0 - d1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (-d0 + 1)>
 # CHECK-NEXT: module attributes {transform.with_named_sequence} {
 # CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
 # CHECK-NEXT:     %c1 = arith.constant 1 : index
@@ -292,72 +527,170 @@
 # CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
 # CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
 # CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
 # CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_A_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = affine.min #map(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map1(%4)
+# CHECK-NEXT:       %6 = arith.cmpi eq, %5, %c0 : index
+# CHECK-NEXT:       %7 = scf.if %6 -> (memref<1x16xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca_1 : memref<1x16xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %8 = linalg.index 0 : index
+# CHECK-NEXT:             %9 = linalg.index 1 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca_1 : memref<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_7 = memref.subview %arg0[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref<?x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %alloca_2 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32>
+# CHECK-NEXT:         %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_2) -> (memref<1x16xf32>) {
+# CHECK-NEXT:           %9 = affine.min #map2(%4)
+# CHECK-NEXT:           %10 = affine.min #map3(%5, %9)
+# CHECK-NEXT:           %11 = arith.cmpi eq, %10, %c0 : index
+# CHECK-NEXT:           %12 = affine.apply #map4(%10)
+# CHECK-NEXT:           %13 = affine.min #map(%arg5)
+# CHECK-NEXT:           %14 = affine.min #map1(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = arith.ori %15, %11 : i1
+# CHECK-NEXT:           %17 = affine.apply #map4(%14)
+# CHECK-NEXT:           %18 = scf.if %16 -> (memref<1x1xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_9 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_9 : memref<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref<?x14xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             linalg.map outs(%alloca_10 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:             %dim = memref.dim %subview_12, %c0_13 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:             %dim_15 = memref.dim %subview_12, %c1_14 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             memref.copy %subview_12, %subview_16 : memref<?x?xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             scf.yield %alloca_10 : memref<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         scf.yield %8 : memref<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview = memref.subview %0[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg0, %subview : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
-# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_1) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_B_pad_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_3) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %4 = affine.min #map(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map1(%4)
+# CHECK-NEXT:       %6 = arith.cmpi eq, %5, %c0 : index
+# CHECK-NEXT:       %7 = scf.if %6 -> (memref<1x16xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca_4 : memref<1x16xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %8 = linalg.index 0 : index
+# CHECK-NEXT:             %9 = linalg.index 1 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca_4 : memref<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_7 = memref.subview %arg1[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref<?x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %alloca_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32>
+# CHECK-NEXT:         %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_5) -> (memref<1x16xf32>) {
+# CHECK-NEXT:           %9 = affine.min #map2(%4)
+# CHECK-NEXT:           %10 = affine.min #map3(%5, %9)
+# CHECK-NEXT:           %11 = arith.cmpi eq, %10, %c0 : index
+# CHECK-NEXT:           %12 = affine.apply #map4(%10)
+# CHECK-NEXT:           %13 = affine.min #map(%arg5)
+# CHECK-NEXT:           %14 = affine.min #map1(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = arith.ori %15, %11 : i1
+# CHECK-NEXT:           %17 = affine.apply #map4(%14)
+# CHECK-NEXT:           %18 = scf.if %16 -> (memref<1x1xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_9 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_9 : memref<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref<?x14xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             linalg.map outs(%alloca_10 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:             %dim = memref.dim %subview_12, %c0_13 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:             %dim_15 = memref.dim %subview_12, %c1_14 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             memref.copy %subview_12, %subview_16 : memref<?x?xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             scf.yield %alloca_10 : memref<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         scf.yield %8 : memref<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_2 = memref.subview %1[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %arg1, %subview_2 : memref<14x14xf32> to memref<14x14xf32, strided<[16, 1]>>
 # CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_4 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_4) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_6 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
-# CHECK-NEXT:         %subview_7 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %subview_6, %subview_7 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_6) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_7 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
 # CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) {
-# CHECK-NEXT:       %subview_4 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %subview_5 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_5) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:         %subview_7 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_8) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
-# CHECK-NEXT:           %subview_10 = memref.subview %subview_4[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           %subview_11 = memref.subview %subview_7[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_10, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:       %subview_6 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_7) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_9 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_12 = memref.subview %subview_6[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_13 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
 # CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         } {"./k"}
-# CHECK-NEXT:         %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:         memref.copy %5, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       } {"./j"}
-# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
-# CHECK-NEXT:       memref.copy %4, %subview_6 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_8 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
 # CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
 # CHECK-NEXT:     } {"./i"}
-# CHECK-NEXT:     %subview_3 = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
-# CHECK-NEXT:     memref.copy %subview_3, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     %subview = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
 # CHECK-NEXT:     return
 # CHECK-NEXT:   }
 # CHECK-NEXT: }