diff --git a/src/xtc/backends/mlir/MlirCompiler.py b/src/xtc/backends/mlir/MlirCompiler.py
index b4c9bfe3..8a761f87 100644
--- a/src/xtc/backends/mlir/MlirCompiler.py
+++ b/src/xtc/backends/mlir/MlirCompiler.py
@@ -20,6 +20,7 @@
 from xtc.backends.mlir.MlirCompilerPasses import (
     MlirProgramInsertTransformPass,
     MlirProgramApplyTransformPass,
+    apply_bufferization_passes,
 )
 
 from xtc.backends.mlir.MlirTarget import (
@@ -149,6 +150,15 @@ def mlir_apply_transform_pass(self) -> None:
         if self._config.print_transformed_ir:
             self.dump_ir("IR Dump After transform")
 
+    def mlir_apply_tensor_lowering_pass(self) -> None:
+        """Run the bufferization passes, dumping the IR around them if configured."""
+        trace_ir = self._config.print_bufferization_ir
+        if trace_ir:
+            self.dump_ir("IR Dump Before Tensor Lowering")
+        apply_bufferization_passes(self._mlir_program)
+        if trace_ir:
+            self.dump_ir("IR Dump After Tensor Lowering")
+
     def _save_temp(self, fname: str, content: Any) -> None:
         if not self._config.save_temps:
             return
@@ -196,4 +206,6 @@ def compile(self) -> None:
         self.mlir_apply_transform_pass()
         save_temp(mlir_atrn_dump_file, self._mlir_program.mlir_module)
 
+        self.mlir_apply_tensor_lowering_pass()
+
         self._target.generate_code_for_target(self._mlir_program, dump_file=dump_file)
diff --git a/src/xtc/backends/mlir/MlirCompilerPasses.py b/src/xtc/backends/mlir/MlirCompilerPasses.py
index de33ff28..08835ade 100644
--- a/src/xtc/backends/mlir/MlirCompilerPasses.py
+++ b/src/xtc/backends/mlir/MlirCompilerPasses.py
@@ -534,3 +534,44 @@ def run(self) -> None:
                 transform_op.erase()
             else:
                 break
+
+
+class MlirProgramApplyPasses:
+    """Runs a list of named MLIR passes over the module of a program."""
+
+    def __init__(self, mlir_program: RawMlirProgram) -> None:
+        self._mlir_program = mlir_program
+
+    def run(self, pass_names: list[str]) -> None:
+        """Add every pass in ``pass_names`` to a fresh PassManager, then run it."""
+        program = self._mlir_program
+        manager = PassManager(context=program.mlir_context)
+        for pass_name in pass_names:
+            manager.add(pass_name)  # type: ignore # no attribute add
+        manager.run(program.mlir_module.operation)
+
+
+def apply_bufferization_passes(mlir_program: RawMlirProgram):
+    """Run the bufferization pass pipeline on the module of ``mlir_program``."""
+    one_shot_options = (
+        "bufferize-function-boundaries=1"
+        " function-boundary-type-conversion=identity-layout-map"
+        " buffer-alignment=256"
+    )
+    # Passes are added to the pass manager in the order listed here.
+    pipeline = [
+        "canonicalize",
+        "cse",
+        "eliminate-empty-tensors",  # causes ops to write directly to out buffer
+        f"one-shot-bufferize{{{one_shot_options}}}",
+        "func.func(buffer-hoisting)",
+        "func.func(buffer-loop-hoisting)",
+        "drop-equivalent-buffer-results",
+        "func.func(promote-buffers-to-stack)",
+    ]
+    MlirProgramApplyPasses(mlir_program).run(pipeline)
+
+
+def pre_transform_tensor_passes(mlir_program: RawMlirProgram) -> None:
+    """Placeholder hook: currently leaves ``mlir_program`` untouched."""
+    # TODO: decide whether to run ["eliminate-empty-tensors"] on the program here.
diff --git a/src/xtc/backends/mlir/MlirConfig.py b/src/xtc/backends/mlir/MlirConfig.py
index 2d0ab512..65345681 100644
--- a/src/xtc/backends/mlir/MlirConfig.py
+++ b/src/xtc/backends/mlir/MlirConfig.py
@@ -22,6 +22,7 @@ class MlirConfig:
     print_assembly: bool = False
     visualize_jumps: bool = True
     print_lowered_ir: bool = False
+    print_bufferization_ir: bool = False
     debug: bool = False
     color: bool = False
     concluding_passes: list[str] = field(default_factory=list)
diff --git a/src/xtc/backends/mlir/MlirGraphBackend.py b/src/xtc/backends/mlir/MlirGraphBackend.py
index 191cad02..ab973fb6 100644
--- a/src/xtc/backends/mlir/MlirGraphBackend.py
+++ b/src/xtc/backends/mlir/MlirGraphBackend.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024-2026 The XTC Project Authors
 #
-from typing import cast, Any
+from typing import cast, Any, Type
 from typing_extensions import override
 
 from xdsl.dialects.func import FuncOp as xdslFuncOp
-from xdsl.dialects import func, memref
-from xdsl.dialects.builtin import MemRefType, f32, f64
+from xdsl.dialects import func, memref, tensor, bufferization
+from xdsl.dialects.builtin import MemRefType, TensorType, f32, f64, UnitAttr
 from xdsl.ir import Region, Block, Operation
 from xdsl.builder import ImplicitBuilder
 
@@ -28,7 +28,11 @@ def __init__(
         concluding_passes: list[str] = [],
         always_vectorize: bool = False,
         no_alias: bool = True,
+        use_tensor_dialect: bool = False,
     ):
+        self.xdsl_type: Type[TensorType] | Type[MemRefType] = (
+            TensorType if use_tensor_dialect else MemRefType
+        )
         if isinstance(xdsl_func, XTCGraph):
             assert nodes is None
             graph = xdsl_func
@@ -62,13 +66,24 @@ def _init_from_xdsl(
     def _xdsl_generate_node(
         self, node: XTCNode, block: Block, variables: dict[str, Any]
     ):
-        operation = MlirOperation.from_operation(node.operation, name=node.name)
+        operation = MlirOperation.from_operation(
+            node.operation,
+            name=node.name,
+            op_type=self.xdsl_type,  # type: ignore
+        )
         names = [*node.inputs, *node.outputs]
         assert node.inputs_types is not None and node.outputs_types is not None
         types = [*node.inputs_types, *node.outputs_types]
         for name, type in zip(names, types):
+            if name in node.outputs and self.xdsl_type == TensorType:
+                with ImplicitBuilder(block):
+                    variables[name] = tensor.EmptyOp(
+                        dynamic_sizes=[],
+                        tensor_type=self._xdsl_type_from_tensortype(type),
+                    ).results[0]
             if name in variables:
                 continue
+            assert self.xdsl_type != TensorType
             with ImplicitBuilder(block):
                 elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
                 alloca = memref.AllocaOp.get(
@@ -79,6 +94,11 @@ def _xdsl_generate_node(
             variables[name] = alloca.results[0]
         args = [variables[name] for name in names]
         _, attrs = operation.generate(block=block, args=args)
+        # the tensor dialect needs the result of the op, not the alloca
+        if self.xdsl_type == TensorType:
+            assert len(node.outputs) == len(attrs["output_nodes"])
+            for name, output in zip(node.outputs, attrs["output_nodes"]):
+                variables[name] = output.results[0]
         return attrs
 
     def _init_from_graph(
@@ -95,18 +115,34 @@ def _init_from_graph(
         )
         params_types = [
             self._xdsl_type_from_tensortype(cast(XTCTensorType, tensor_type))
-            for tensor_type in [*inputs_types, *outputs_types]
+            for tensor_type in inputs_types
         ]
+        # graph output types are always memrefs
+        params_types.extend(
+            self._memref_type_from_tensortype(cast(XTCTensorType, tensor_type))
+            for tensor_type in outputs_types
+        )
         inlined_block = Block(arg_types=params_types)
         variables = {
             name: arg
             for name, arg in zip([*graph.inputs, *graph.outputs], inlined_block.args)
         }
         block_attrs = []
+
         for node in graph.nodes.values():
             node_attrs = self._xdsl_generate_node(node, inlined_block, variables)
             block_attrs.append(node_attrs)
         with ImplicitBuilder(inlined_block):
+            if self.xdsl_type == TensorType:
+                # write the final tensor values to the output buffers
+                for name, out_arg in zip(
+                    graph.outputs, inlined_block.args[-len(graph.outputs) :]
+                ):
+                    bufferization.MaterializeInDestinationOp(
+                        operands=((variables[name],), (out_arg,)),
+                        result_types=((),),
+                        attributes={"writable": UnitAttr(), "restrict": UnitAttr()},
+                    )
             func.ReturnOp()
         region = Region([inlined_block])  # type: ignore # issue with mypy
         payload = xdslFuncOp.from_region(
@@ -128,6 +164,7 @@ def _init_from_graph(
                     always_vectorize=always_vectorize,
                     concluding_passes=concluding_passes,
                     id=f"__xtc_id_{node_id}_",
+                    xdsl_type=self.xdsl_type,
                 )
         return payload, nodes_dict
 
@@ -136,11 +173,15 @@ def _xdsl_elt_shape_from_tensortype(self, type: XTCTensorType) -> tuple[Any, Any
         return (elt_type, type.constant_shape)
 
     def _xdsl_type_from_tensortype(self, type: XTCTensorType) -> Any:
+        elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
+        return self.xdsl_type(elt_type, shape)  # tensor or memref, per self.xdsl_type
+
+    def _memref_type_from_tensortype(self, type: XTCTensorType) -> Any:
         elt_type, shape = self._xdsl_elt_shape_from_tensortype(type)
         return MemRefType(elt_type, shape)
 
     def _np_types_spec(
-        self, types: list[MemRefType]
+        self, types: list[MemRefType] | list[TensorType]
     ) -> list[dict[str, tuple[int, ...] | str]]:
         types_map = {"f32": "float32", "f64": "float64"}
         types_spec: list[dict[str, tuple[int, ...] | str]] = [
@@ -156,12 +197,12 @@ def _np_types_spec(
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         inputs_args_types = [arg.type for arg in self.xdsl_func.args[:-1]]
-        list_memref_tys = cast(list[MemRefType], inputs_args_types)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[MemRefType] | list[TensorType], inputs_args_types)
+        return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         # Assume inputs are first, and output is single last param
         outputs_args_types = [arg.type for arg in self.xdsl_func.args[-1:]]
-        list_memref_tys = cast(list[MemRefType], outputs_args_types)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[MemRefType], outputs_args_types)
+        return self._np_types_spec(list_xdsl_tys)
diff --git a/src/xtc/backends/mlir/MlirNodeBackend.py b/src/xtc/backends/mlir/MlirNodeBackend.py
index 135e99b8..f809e392 100644
--- a/src/xtc/backends/mlir/MlirNodeBackend.py
+++ b/src/xtc/backends/mlir/MlirNodeBackend.py
@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024-2026 The XTC Project Authors
 #
-from typing import cast, Any
+from typing import cast, Any, Type
 from typing_extensions import override
 
 from xdsl.ir import Operation as xdslOperation
-from xdsl.dialects.builtin import MemRefType as xdslAnyMemRefType
+from xdsl.dialects.builtin import MemRefType, TensorType
 from xdsl.dialects.builtin import UnitAttr as xdslUnitAttr
 from xtc.utils.xdsl_aux import xdsl_operator_to_function
 
@@ -26,8 +26,10 @@ def __init__(
         always_vectorize: bool = False,
         no_alias: bool = True,
         id: str | None = None,
+        xdsl_type: Type[TensorType] | Type[MemRefType] = MemRefType,
     ):
         self._graph = None
+        self.xdsl_type = xdsl_type
         if id is None:
             self.op_id_attribute = f"__id{MlirNodeBackend.count}__"
             MlirNodeBackend.count += 1
@@ -48,7 +50,7 @@ def __init__(
         self.loop_stamps = loop_stamps
 
     def _np_types_spec(
-        self, types: list[xdslAnyMemRefType]
+        self, types: list[MemRefType | TensorType]
     ) -> list[dict[str, tuple[int, ...] | str]]:
         types_map = {"f32": "float32", "f64": "float64"}
         types_spec: list[dict[str, tuple[int, ...] | str]] = [
@@ -63,11 +65,11 @@ def _np_types_spec(
     @override
     def np_inputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.inputs]  # type: ignore
-        list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[MemRefType | TensorType], list_attr_tys)
+        return self._np_types_spec(list_xdsl_tys)
 
     @override
     def np_outputs_spec(self) -> list[dict[str, Any]]:
         list_attr_tys = [i.type for i in self.source_op.outputs]  # type: ignore
-        list_memref_tys = cast(list[xdslAnyMemRefType], list_attr_tys)
-        return self._np_types_spec(list_memref_tys)
+        list_xdsl_tys = cast(list[MemRefType | TensorType], list_attr_tys)
+        return self._np_types_spec(list_xdsl_tys)
diff --git a/src/xtc/backends/mlir/MlirOps.py b/src/xtc/backends/mlir/MlirOps.py
index be687b44..48e0a2f1 100644
--- a/src/xtc/backends/mlir/MlirOps.py
+++ b/src/xtc/backends/mlir/MlirOps.py
@@ -7,9 +7,11 @@
 from typing_extensions import override
 from typing import Any, Type, TypeAlias, cast
 
-from xdsl.dialects import linalg, arith, builtin, memref
+from xdsl.dialects import linalg, arith, builtin, memref, tensor
 from xdsl.dialects.builtin import (
     MemRefType,
+    TensorType,
+    IndexType,
     f32,
     f64,
     i64,
@@ -42,8 +44,9 @@ def __init__(
         args: tuple[Any, ...],
         attrs: dict[str, Any] = {},
         name: str | None = None,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
-        self.operator = operator(args, attrs, name=name)
+        self.operator = operator(args, attrs, name=name, op_type=op_type)
         self.args = args
         self.attrs = attrs
         self.name = self.operator.name if name is None else name
@@ -78,7 +81,12 @@ def np_outputs_spec(self) -> list[dict[str, Any]]:
         return outputs_spec
 
     @classmethod
-    def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation":
+    def from_operation(
+        cls,
+        xtc_op: Operation,
+        name: str | None,
+        op_type: Type[MemRefType] | Type[TensorType],
+    ) -> "MlirOperation":
         dims = xtc_op.dims.values()
         dtype = xtc_op.inputs_types[0].dtype  # TODO: currently get dtype from 1st arg
         args = tuple([*dims, dtype])
@@ -88,6 +96,7 @@ def from_operation(cls, xtc_op: Operation, name: str | None) -> "MlirOperation":
             args,
             dict(attrs),
             name=name,
+            op_type=op_type,
         )
 
 
@@ -97,11 +106,16 @@ class MlirOperator(ABC):
     KINDS = ""
 
     def __init__(
-        self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None
+        self,
+        args: tuple[Any, ...],
+        attrs: dict[str, Any],
+        name: str | None = None,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
         self.args = args
         self.attrs = {**attrs}
         self.name = name if name is not None else self.DEFAULT_NAME
+        self.op_type = op_type
 
     @abstractmethod
     def generate_op(
@@ -149,23 +163,27 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape) for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
+                self.op_type(elt_type, shape)
+                for shape in [[Ki, Kk], [Kk, Kj], [Ki, Kj]]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 3
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
+            result = (args[2].type,) if self.op_type == TensorType else ()
             fill = linalg.FillOp(
-                res=(),
+                res=result,
                 inputs=(cst0.results[0],),
                 outputs=(args[2],),
             )
             reduce = linalg.MatmulOp(
-                res=(),
+                res=result,
                 inputs=(args[0], args[1]),
-                outputs=(args[2],),
+                outputs=(fill.results[0],)
+                if self.op_type == TensorType
+                else (args[2],),
             )
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
@@ -180,6 +198,7 @@ def generate_op(
                 {"i": Ki, "j": Kj},
                 self.dims_sizes(),
             ],
+            "output_nodes": [reduce],
         }
         return block, attrs
 
@@ -223,10 +242,14 @@ class MlirOperatorConv2D(MlirOperator):
     DEFAULT_STRIDE = (1, 1)
 
     def __init__(
-        self, args: tuple[Any, ...], attrs: dict[str, Any], name: str | None = None
+        self,
+        args: tuple[Any, ...],
+        attrs: dict[str, Any],
+        name: str | None = None,
+        op_type: Type[MemRefType] | Type[TensorType] = MemRefType,
     ) -> None:
         attrs = {"stride": self.DEFAULT_STRIDE, **attrs}
-        super().__init__(args, attrs, name)
+        super().__init__(args, attrs, name, op_type)
 
     @override
     def dims(self, kind: str = "") -> tuple[str, ...]:
@@ -250,16 +273,17 @@ def generate_op(
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape) for shape in [*inps_dims, out_dims]
+                self.op_type(elt_type, shape) for shape in [*inps_dims, out_dims]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 3
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         with ImplicitBuilder(block):
+            result = (args[2].type,) if self.op_type == TensorType else ()
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
             fill = linalg.FillOp(
-                res=(),
+                res=result,
                 inputs=(cst0.results[0],),
                 outputs=(args[2],),
             )
@@ -282,7 +306,9 @@ def generate_op(
                 linalg.YieldOp(add)
             reduce = linalg.GenericOp(
                 inputs=(args[0], args[1]),
-                outputs=(args[2],),
+                outputs=(fill.results[0],)
+                if self.op_type == TensorType
+                else (args[2],),
                 body=Region([block_in]),  # type: ignore # mypy issue with dataclass
                 # ignore typing due to xdsl hints limitation
                 indexing_maps=[
@@ -306,6 +332,7 @@ def generate_op(
                     ),
                 ],
                 iterator_types=iterator_types,
+                result_types=result,
             )
         fill_node_id = f"{self.name}_0"
         reduce_node_id = f"{self.name}"
@@ -320,6 +347,7 @@ def generate_op(
                 {"b": Kb, "h": Kh, "w": Kw, "f": Kf},
                 self.dims_sizes(),
             ],
+            "output_nodes": [reduce],
         }
         return block, attrs
 
@@ -367,13 +395,14 @@ def generate_op(
         elt_type = {"float32": f32, "float64": f64}[dtype]
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
-            ops_types = [MemRefType(elt_type, shape) for shape in [[Ki], [Ki]]]
+            ops_types = [self.op_type(elt_type, shape) for shape in [[Ki], [Ki]]]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         inp_shape, out_shape = [
-            list(cast(MemRefType, arg.type).get_shape()) for arg in args
+            list(cast(self.op_type, arg.type).get_shape())  # type: ignore
+            for arg in args
         ]
         inp_size, out_size = [mulall(shape) for shape in [inp_shape, out_shape]]
         assert inp_size == out_size
@@ -392,50 +421,58 @@ def generate_op(
                     )
                 ]
             )
-            inp = memref.CollapseShapeOp(
-                operands=[args[0]],
-                properties=dict(reassociation=inp_reassociation),
-                result_types=[MemRefType(elt_type, (inp_size,))],
-            )
-            out = memref.CollapseShapeOp(
-                operands=[args[1]],
-                properties=dict(reassociation=out_reassociation),
-                result_types=[MemRefType(elt_type, (out_size,))],
-            )
+            if self.op_type == TensorType:
+                out_operand = args[1]
+                inp_operand = args[0]
+                rank = len(out_shape)
+                iterator_types = [StringAttr("parallel")] * rank
+                indexing_maps = [
+                    AffineMapAttr(AffineMap.identity(rank)),  # input
+                    AffineMapAttr(
+                        AffineMap.identity(rank).drop_results(out_shape)
+                    ),  # scalar
+                    AffineMapAttr(AffineMap.identity(rank)),  # output
+                ]
+            else:
+                inp = memref.CollapseShapeOp(  # type: ignore
+                    operands=[args[0]],
+                    properties=dict(reassociation=inp_reassociation),
+                    result_types=[self.op_type(elt_type, (inp_size,))],
+                )
+                inp_operand = inp.results[0]  # type: ignore
+                out = memref.CollapseShapeOp(
+                    operands=[args[1]],
+                    properties=dict(reassociation=out_reassociation),
+                    result_types=[self.op_type(elt_type, (out_size,))],
+                )
+                out_operand = out.results[0]  # type: ignore
+                iterator_types = [
+                    StringAttr({"P": "parallel", "R": "reduction"}[k])
+                    for k in self.KINDS
+                ]
+                # ignore typing due to xdsl hints limitation
+                indexing_maps = [
+                    AffineMapAttr(AffineMap.from_callable(lambda i: (i,))),  # type: ignore
+                    AffineMapAttr(AffineMap.from_callable(lambda _: ())),  # type: ignore
+                    AffineMapAttr(AffineMap.from_callable(lambda i: (i,))),  # type: ignore
+                ]
+                iterator_types = [
+                    StringAttr({"P": "parallel", "R": "reduction"}[k])
+                    for k in self.KINDS
+                ]
+            result = (args[1].type,) if self.op_type == TensorType else ()
             cst0 = arith.ConstantOp(builtin.FloatAttr(0, elt_size))
-            iterator_types = [
-                StringAttr({"P": "parallel", "R": "reduction"}[k]) for k in self.KINDS
-            ]
             block_in = Block(arg_types=[f32, f32, f32])
             with ImplicitBuilder(block_in):
                 max = arith.MaximumfOp(block_in.args[0], block_in.args[1])
                 linalg.YieldOp(max)
             relu = linalg.GenericOp(
-                inputs=(inp.results[0], cst0.results[0]),
-                outputs=(out.results[0],),
+                inputs=(inp_operand, cst0.results[0]),
+                outputs=(out_operand,),
                 body=Region([block_in]),  # type: ignore # mypy issue with dataclass
-                # ignore typing due to xdsl hints limitation
-                indexing_maps=[
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda i:  # type: ignore
-                            (i,)
-                        )
-                    ),
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda _:  # type: ignore
-                            ()
-                        )
-                    ),
-                    AffineMapAttr(
-                        AffineMap.from_callable(
-                            lambda i:  # type: ignore
-                            (i,)
-                        )
-                    ),
-                ],
+                indexing_maps=indexing_maps,
                 iterator_types=iterator_types,
+                result_types=result,
             )
         relu_node_id = f"{self.name}"
         relu.attributes[f"__xtc_id_{relu_node_id}_"] = UnitAttr()
@@ -446,6 +483,7 @@ def generate_op(
             "dims_sizes": [
                 self.dims_sizes(),
             ],
+            "output_nodes": [relu],
         }
         return block, attrs
 
@@ -492,25 +530,31 @@ def generate_op(
         dims_value = list(self.args[:-1])
         padding = self.attrs["padding"]
         constant_value = self.attrs["constant_value"]
+        lows = [0] * len(dims_value)
+        highs = [0] * len(dims_value)
         if isinstance(padding, dict):
             dims_value_before_pad = list(dims_value)
             for i, pad_value in padding.items():
                 dims_value_before_pad[i] -= sum(pad_value)
+                lows[i] = pad_value[0]
+                highs[i] = pad_value[1]
         else:
             dims_value_before_pad = [
                 dim_value - sum(padding) for dim_value in dims_value
             ]
+            lows = [padding[0] for _ in dims_value]
+            highs = [padding[1] for _ in dims_value]
         elt_type = {"float32": f32, "float64": f64}[dtype]
         elt_size = {"float32": 32, "float64": 64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape)
+                self.op_type(elt_type, shape)  # must satisfy the op_type assert below
                 for shape in [dims_value_before_pad, dims_value]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         if isinstance(padding, dict):
             offsets = [0 for _ in self.args[:-1]]
             for i, (pad_b, pad_a) in padding.items():
@@ -519,38 +563,57 @@ def generate_op(
             offsets = [padding[0] for _ in self.args[:-1]]
         sizes = list(dims_value_before_pad)
         strides = [1 for _ in self.args[:-1]]
+        using_tensors = self.op_type == TensorType
         with ImplicitBuilder(block):
             cst0 = arith.ConstantOp(builtin.FloatAttr(constant_value, elt_size))
-            fill = linalg.FillOp(
-                res=(),
-                inputs=(cst0.results[0],),
-                outputs=(args[1],),
-            )
-            subview = memref.SubviewOp.from_static_parameters(
-                source=args[1],
-                source_type=args[1].type,  # type: ignore
-                offsets=offsets,
-                sizes=sizes,
-                strides=strides,
-            )
-            copy = linalg.CopyOp(
-                inputs=[args[0]],
-                outputs=[subview.result],
-                res=(),
-            )
-        fill_node_id = f"{self.name}_0"
-        fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
+            result = (args[1].type,) if using_tensors else ()
+            fill_node_id = f"{self.name}_0"
+            if using_tensors:
+                fill = None
+                block_in = Block(arg_types=[IndexType()] * len(dims_value))
+                with ImplicitBuilder(block_in):
+                    tensor.YieldOp(cst0)
+                copy = tensor.PadOp(
+                    source=args[0],
+                    region=Region([block_in]),
+                    low=[],
+                    high=[],
+                    nofold=UnitAttr(),
+                    result_type=TensorType(elt_type, dims_value),
+                    static_low=lows,
+                    static_high=highs,
+                )
+            else:
+                fill = linalg.FillOp(
+                    res=result,
+                    inputs=(cst0.results[0],),
+                    outputs=(args[1],),
+                )
+                subview = memref.SubviewOp.from_static_parameters(
+                    source=args[1],
+                    source_type=args[1].type,  # type: ignore
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+                copy = linalg.CopyOp(  # type: ignore
+                    inputs=[args[0]],
+                    outputs=[subview.result],
+                    res=result,
+                )
+                fill.attributes[f"__xtc_id_{fill_node_id}_"] = UnitAttr()
         copy_node_id = f"{self.name}"
         copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
-                fill_node_id: fill,
+                **({fill_node_id: fill} if fill else {}),
                 copy_node_id: copy,
             },
             "dims_sizes": [
                 self.dims_sizes(),
-                self.dims_sizes(),
+                *([] if using_tensors else [self.dims_sizes()]),
             ],
+            "output_nodes": [copy],
         }
         return block, attrs
 
@@ -618,13 +681,13 @@ def generate_op(
         elt_type = {"float32": f32, "float64": f64}[dtype]
         if block is None:
             ops_types = [
-                MemRefType(elt_type, shape)
+                self.op_type(elt_type, shape)
                 for shape in [dims_values_before_unpad, dims_values]
             ]
             block = Block(arg_types=ops_types)
             args = block.args
         assert len(args) == 2
-        assert all(isinstance(arg.type, MemRefType) for arg in args)
+        assert all(isinstance(arg.type, self.op_type) for arg in args)
         if isinstance(padding, dict):
             offsets = [0 for _ in self.args[:-1]]
             for i, (pad_b, _) in padding.items():
@@ -633,28 +696,36 @@ def generate_op(
             offsets = [padding[0] for _ in self.args[:-1]]
         sizes = dims_values
         strides = [1 for _ in self.args[:-1]]
+        using_tensors = self.op_type == TensorType
         with ImplicitBuilder(block):
-            subview = memref.SubviewOp.from_static_parameters(
-                source=args[0],
-                source_type=args[0].type,  # type: ignore
-                offsets=offsets,
-                sizes=sizes,
-                strides=strides,
-            )
-            copy = linalg.CopyOp(
-                inputs=[subview.result],
-                outputs=[args[1]],
-                res=(),
-            )
+            if using_tensors:
+                copy = tensor.ExtractSliceOp.from_static_parameters(
+                    source=args[0],
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+            else:
+                subview = memref.SubviewOp.from_static_parameters(
+                    source=args[0],
+                    source_type=args[0].type,  # type: ignore
+                    offsets=offsets,
+                    sizes=sizes,
+                    strides=strides,
+                )
+                copy = linalg.CopyOp(  # type: ignore
+                    inputs=[subview.result],
+                    outputs=[args[1]],
+                    res=(),
+                )
         copy_node_id = f"{self.name}"
         copy.attributes[f"__xtc_id_{copy_node_id}_"] = UnitAttr()
         attrs = {
             "nodes_map": {
-                copy_node_id: copy,
+                copy_node_id: None if using_tensors else copy,
             },
-            "dims_sizes": [
-                self.dims_sizes(),
-            ],
+            "dims_sizes": [*([] if using_tensors else [self.dims_sizes()])],
+            "output_nodes": [copy],
         }
         return block, attrs
 
diff --git a/src/xtc/utils/xdsl_aux.py b/src/xtc/utils/xdsl_aux.py
index c339f02d..0061ccba 100644
--- a/src/xtc/utils/xdsl_aux.py
+++ b/src/xtc/utils/xdsl_aux.py
@@ -12,6 +12,7 @@
 from xdsl.dialects.arith import ConstantOp
 from xdsl.dialects.builtin import (
     MemRefType,
+    TensorType,
     IntegerAttr,
     FloatAttr,
     IntegerType,
@@ -19,7 +20,7 @@
 
 from xdsl.context import Context
 from xdsl.parser import Parser
-from xdsl.dialects import func, linalg, arith, memref
+from xdsl.dialects import func, linalg, arith, memref, tensor
 from xdsl.dialects.builtin import ModuleOp
 
 
@@ -29,6 +30,7 @@ def parse_xdsl_module(source: str) -> ModuleOp:
     context.load_dialect(linalg.Linalg)
     context.load_dialect(arith.Arith)
     context.load_dialect(memref.MemRef)
+    context.load_dialect(tensor.Tensor)
     parser = Parser(context, source)
     module = parser.parse_module()
     return module
@@ -39,7 +41,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp:
     operands = source_op.operands
     shaped_types, scalar_types = [], []
     for o in operands:
-        if isa(o.type, MemRefType):
+        if isa(o.type, MemRefType) or isa(o.type, TensorType):
             shaped_types.append(o.type)
         else:
             scalar_types.append(o.type)
@@ -49,7 +51,7 @@ def xdsl_operator_to_function(source_op: Operation, name: str) -> func.FuncOp:
     concrete_operands = []
     shaped_count, scalar_count = 0, 0
     for o in operands:
-        if isa(o.type, MemRefType):
+        if isa(o.type, MemRefType) or isa(o.type, TensorType):
             concrete_operands.append(payload.args[shaped_count])
             shaped_count += 1
         else:
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
new file mode 100644
index 00000000..a6791aaa
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_mini_mlir_tensor.py
@@ -0,0 +1,418 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
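+# Small unit-stride conv2d: the input is sized (H + R - 1, W + S - 1) so a 3x3 kernel without padding yields an 8x8 output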
+N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 3, 3, 3, 1, 1, "float32"
+a = O.tensor((N, H + R - 1, W + S - 1, C), dtype, name="I")
+b = O.tensor((R, S, C, F), dtype, name="W")
+
+with O.graph(name="conv2d_nhwc_mini") as gb:
+    O.conv2d(a, b, stride=(SH, SW), name="O")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)
+
+sch = impl.get_scheduler()
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="conv2d_nhwc_mini_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x8x8x16xf32>) -> tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x10x10x3xf32>, tensor<3x3x3x16xf32>) outs(%1 : tensor<1x8x8x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %3 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %4 = arith.addf %out, %3 : f32
+# CHECK-NEXT:       linalg.yield %4 : f32
+# CHECK-NEXT:     } -> tensor<1x8x8x16xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./f" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./c" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_7 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_8 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:             %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:             %c3 = arith.constant 3 : index
+# CHECK-NEXT:             %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:             %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
+# CHECK-NEXT:               %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_31 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:               %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_38 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
+# CHECK-NEXT:                 %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_45: f32, %out: f32):
+# CHECK-NEXT:                     %10 = arith.mulf %in, %in_45 : f32
+# CHECK-NEXT:                     %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                     linalg.yield %11 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: tensor<1x10x10x3xf32> {llvm.noalias}, %arg1: tensor<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %0) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_4 to %c8 step %c1_5 iter_args(%arg6 = %extracted_slice) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_7 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_8 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_7 to %c8_8 step %c1_9 iter_args(%arg8 = %extracted_slice_6) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_11 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_12 to %c16 step %c1_13 iter_args(%arg10 = %extracted_slice_11) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_15 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_16 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_16 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_14 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_10 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_10 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_1 to %c1_2 step %c1_3 iter_args(%arg4 = %1) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 10, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x10x10x3xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_5 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x8x8x16xf32>
+# CHECK-NEXT:       %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_6 to %c8 step %c1_7 iter_args(%arg6 = %extracted_slice_5) -> (tensor<1x8x8x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %arg5, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : tensor<1x10x10x3xf32> to tensor<1x3x10x3xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> to tensor<1x1x8x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c8_12 = arith.constant 8 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_11 to %c8_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x8x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %extracted_slice_8[0, 0, %arg7, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x10x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:           %extracted_slice_16 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [3, 3, 3, 16] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_17 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_18 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0_18 to %c16 step %c1_19 iter_args(%arg10 = %extracted_slice_17) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[0, 0, 0, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x3x3x3xf32>
+# CHECK-NEXT:             %extracted_slice_22 = tensor.extract_slice %extracted_slice_16[0, 0, 0, %arg9] [3, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:             %c3 = arith.constant 3 : index
+# CHECK-NEXT:             %c1_25 = arith.constant 1 : index
+# CHECK-NEXT:             %6 = scf.for %arg11 = %c0_24 to %c3 step %c1_25 iter_args(%arg12 = %extracted_slice_23) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_27 = tensor.extract_slice %extracted_slice_21[0, %arg11, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
+# CHECK-NEXT:               %extracted_slice_28 = tensor.extract_slice %extracted_slice_22[%arg11, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<3x3x3x1xf32> to tensor<1x3x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_29 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_30 = arith.constant 0 : index
+# CHECK-NEXT:               %c3_31 = arith.constant 3 : index
+# CHECK-NEXT:               %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:               %7 = scf.for %arg13 = %c0_30 to %c3_31 step %c1_32 iter_args(%arg14 = %extracted_slice_29) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %extracted_slice_27[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_35 = tensor.extract_slice %extracted_slice_28[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3_38 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_39 = arith.constant 1 : index
+# CHECK-NEXT:                 %8 = scf.for %arg15 = %c0_37 to %c3_38 step %c1_39 iter_args(%arg16 = %extracted_slice_36) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_41 = tensor.extract_slice %extracted_slice_34[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_42 = tensor.extract_slice %extracted_slice_35[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_43 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_41, %extracted_slice_42 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_43 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_45: f32, %out: f32):
+# CHECK-NEXT:                     %10 = arith.mulf %in, %in_45 : f32
+# CHECK-NEXT:                     %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                     linalg.yield %11 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_44 = tensor.insert_slice %9 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_44 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_40 = tensor.insert_slice %8 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_33 = tensor.insert_slice %7 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_33 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_26 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_26 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_20 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x8x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_20 : tensor<1x1x8x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : tensor<1x1x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x8x8x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 8, 8, 16] [1, 1, 1, 1] : tensor<1x8x8x16xf32> into tensor<1x8x8x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x8x8x16xf32>, memref<1x8x8x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_mini(%arg0: memref<1x10x10x3xf32> {llvm.noalias}, %arg1: memref<3x3x3x16xf32> {llvm.noalias}, %arg2: memref<1x8x8x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c8 = arith.constant 8 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_1) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_3 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_3 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_3, %subview_4 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_2 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (memref<1x8x8x16xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 3, 10, 3] [1, 1, 1, 1] : memref<1x10x10x3xf32> to memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c8 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %subview[0, 0, %arg5, 0] [1, 3, 3, 3] [1, 1, 1, 1] : memref<1x3x10x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %arg1[0, 0, 0, %arg7] [3, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x16xf32> to memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           %4 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %subview_6) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_8 = memref.subview %subview_2[0, %arg9, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : memref<1x3x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_9 = memref.subview %subview_5[%arg9, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : memref<3x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %5 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_10 = memref.subview %subview_8[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x3x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_11 = memref.subview %subview_9[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x3x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %6 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_12 = memref.subview %subview_10[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[300, 30, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_13 = memref.subview %subview_11[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[144, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1xf32, strided<[300, 30, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[144, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_14: f32, %out: f32):
+# CHECK-NEXT:                   %7 = arith.mulf %in, %in_14 : f32
+# CHECK-NEXT:                   %8 = arith.addf %out, %7 : f32
+# CHECK-NEXT:                   linalg.yield %8 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:               } {"./c"}
+# CHECK-NEXT:               scf.yield %6 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %5 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %4, %subview_7 : memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_4 : memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x8x8x16xf32> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>> to memref<1x1x8x16xf32, strided<[1024, 128, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x8x8x16xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x8x8x16xf32> to memref<1x8x8x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: conv2d_nhwc_mini
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x10x10x3xfloat32
+# CHECK-NEXT:   - %1 : 3x3x3x16xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 1x8x8x16xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: conv2d(%0, %1, stride=(1, 1)) {name = 'O'} : [1x10x10x3xfloat32, 3x3x3x16xfloat32] -> [1x8x8x16xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
new file mode 100644
index 00000000..a363b4e1
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_conv2d_r181_mlir_tensor.py
@@ -0,0 +1,708 @@
+# RUN: python %s 2>&1 | filecheck %s
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+from xtc.artifacts import get_operation
+
+# Problem sizes and strides are read from the "ResNet18_01" conv2d artifact.
+op = get_operation("conv2d", "ResNet18_01")
+N, H, W, F, R, S, C = [op["dims"][k] for k in ["n", "h", "w", "f", "r", "s", "c"]]
+SH, SW = [op["params"][k] for k in ["SH", "SW"]]
+dtype = "float32"
+
+a = O.tensor((N, H + R - 1, W + S - 1, C), dtype)  # image operand, padded by the kernel extent
+b = O.tensor((R, S, C, F), dtype)  # filter operand
+
+with O.graph(name="conv2d_nhwc_r181") as gb:
+    O.conv2d(a, b, stride=(SH, SW), name="O")
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)  # exercise the tensor-dialect lowering path
+
+sch = impl.get_scheduler()
+sch.tile("w", {"w1": 4})
+sch.tile("f", {"f1": 16})
+sch.interchange(["b", "h", "w", "f", "r", "s", "c", "w1", "f1"])
+sch.vectorize(["f1"])
+sch.unroll({"w1": 4, "c": 3})
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="conv2d_nhwc_r181_mlir_tensor",
+    print_source_ir=True,
+    print_transformed_ir=True,
+    print_bufferization_ir=True,  # also dump the IR around the tensor lowering step
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")  # the expectations below end on this status line
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%0 : tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%1 : tensor<1x112x112x64xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_0: f32, %out: f32):
+# CHECK-NEXT:       %3 = arith.mulf %in, %in_0 : f32
+# CHECK-NEXT:       %4 = arith.addf %out, %3 : f32
+# CHECK-NEXT:       linalg.yield %4 : f32
+# CHECK-NEXT:     } -> tensor<1x112x112x64xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_O_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./f" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_O_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 4, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 16, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./c" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_21 "./w1" : !transform.any_op
+# CHECK-NEXT:     transform.include @_vecto failures(suppress) (%tiled_linalg_op_20) : (!transform.any_op) -> ()
+# CHECK-NEXT:     transform.loop.unroll %loops_21 {factor = 4 : i64} : !transform.any_op
+# CHECK-NEXT:     transform.loop.unroll %loops_19 {factor = 3 : i64} : !transform.any_op
+# CHECK-NEXT:     %2 = transform.get_parent_op %loops_7 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.reduction_to_contract
+# CHECK-NEXT:       transform.apply_patterns.vector.transfer_permutation_patterns
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_outerproduct
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_contraction
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c6 = arith.constant 6 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32>
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32>
+# CHECK-NEXT:       %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32>
+# CHECK-NEXT:         %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32>
+# CHECK-NEXT:           %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32>
+# CHECK-NEXT:             %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:               %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32>
+# CHECK-NEXT:               %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:                 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32>
+# CHECK-NEXT:                 %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32>
+# CHECK-NEXT:                 %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               scf.yield %9 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: tensor<1x230x230x3xf32> {llvm.noalias}, %arg1: tensor<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c6 = arith.constant 6 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %extracted_slice_0 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %extracted_slice_0) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %extracted_slice_2 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x1x64xf32>
+# CHECK-NEXT:           %5 = scf.for %arg9 = %c0 to %c64 step %c1 iter_args(%arg10 = %extracted_slice_2) -> (tensor<1x1x1x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x64xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %6 = linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_5 = tensor.insert_slice %6 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_5 : tensor<1x1x1x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_3 = tensor.insert_slice %5 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 64] [1, 1, 1, 1] : tensor<1x1x1x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_3 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_1 = tensor.insert_slice %4 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_1 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1 iter_args(%arg4 = %1) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : tensor<1x230x230x3xf32> to tensor<1x229x229x3xf32>
+# CHECK-NEXT:       %extracted_slice_0 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x112x112x64xf32>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %extracted_slice_0) -> (tensor<1x112x112x64xf32>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %4, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : tensor<1x229x229x3xf32> to tensor<1x7x229x3xf32>
+# CHECK-NEXT:         %extracted_slice_2 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> to tensor<1x1x112x64xf32>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c112 step %c4 iter_args(%arg8 = %extracted_slice_2) -> (tensor<1x1x112x64xf32>) {
+# CHECK-NEXT:           %6 = affine.apply #map(%arg7)
+# CHECK-NEXT:           %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, %6, 0] [1, 7, 13, 3] [1, 1, 1, 1] : tensor<1x7x229x3xf32> to tensor<1x7x13x3xf32>
+# CHECK-NEXT:           %extracted_slice_5 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> to tensor<1x1x4x64xf32>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %extracted_slice_5) -> (tensor<1x1x4x64xf32>) {
+# CHECK-NEXT:             %extracted_slice_7 = tensor.extract_slice %arg1[0, 0, 0, %arg9] [7, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x64xf32> to tensor<7x7x3x16xf32>
+# CHECK-NEXT:             %extracted_slice_8 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x64xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %extracted_slice_8) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:               %extracted_slice_10 = tensor.extract_slice %extracted_slice_4[0, %arg11, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : tensor<1x7x13x3xf32> to tensor<1x1x13x3xf32>
+# CHECK-NEXT:               %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg11, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : tensor<7x7x3x16xf32> to tensor<1x7x3x16xf32>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c7 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:                 %extracted_slice_12 = tensor.extract_slice %extracted_slice_10[0, 0, %arg13, 0] [1, 1, 7, 3] [1, 1, 1, 1] : tensor<1x1x13x3xf32> to tensor<1x1x7x3xf32>
+# CHECK-NEXT:                 %extracted_slice_13 = tensor.extract_slice %extracted_slice_11[0, %arg13, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : tensor<1x7x3x16xf32> to tensor<1x1x3x16xf32>
+# CHECK-NEXT:                 %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c0] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_15 = tensor.extract_slice %extracted_slice_13[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_16 = tensor.extract_slice %extracted_slice_14[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_17 = tensor.extract_slice %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_16, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_17 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_18 = tensor.insert_slice %10 into %arg14[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_19 = tensor.extract_slice %extracted_slice_14[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_20 = tensor.extract_slice %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %11 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_19, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_20 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_21 = tensor.insert_slice %11 into %inserted_slice_18[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_22 = tensor.extract_slice %extracted_slice_14[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_23 = tensor.extract_slice %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_22, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_23 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_24 = tensor.insert_slice %12 into %inserted_slice_21[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_25 = tensor.extract_slice %extracted_slice_14[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_26 = tensor.extract_slice %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %13 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_25, %extracted_slice_15 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_26 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_27 = tensor.insert_slice %13 into %inserted_slice_24[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_28 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c1] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_29 = tensor.extract_slice %extracted_slice_13[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_30 = tensor.extract_slice %extracted_slice_28[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_31 = tensor.extract_slice %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %14 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_30, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_31 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_32 = tensor.insert_slice %14 into %inserted_slice_27[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_33 = tensor.extract_slice %extracted_slice_28[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_34 = tensor.extract_slice %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %15 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_33, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_34 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_35 = tensor.insert_slice %15 into %inserted_slice_32[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_36 = tensor.extract_slice %extracted_slice_28[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_37 = tensor.extract_slice %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %16 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_36, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_37 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_38 = tensor.insert_slice %16 into %inserted_slice_35[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_28[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %17 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_39, %extracted_slice_29 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_40 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_41 = tensor.insert_slice %17 into %inserted_slice_38[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_42 = tensor.extract_slice %extracted_slice_12[0, 0, 0, %c2] [1, 1, 7, 1] [1, 1, 1, 1] : tensor<1x1x7x3xf32> to tensor<1x1x7x1xf32>
+# CHECK-NEXT:                 %extracted_slice_43 = tensor.extract_slice %extracted_slice_13[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x3x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %extracted_slice_44 = tensor.extract_slice %extracted_slice_42[0, 0, %c0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_45 = tensor.extract_slice %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %18 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_45 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_46 = tensor.insert_slice %18 into %inserted_slice_41[0, 0, %c0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_47 = tensor.extract_slice %extracted_slice_42[0, 0, %c2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_48 = tensor.extract_slice %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %19 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_47, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_48 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_49 = tensor.insert_slice %19 into %inserted_slice_46[0, 0, %c1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_50 = tensor.extract_slice %extracted_slice_42[0, 0, %c4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_51 = tensor.extract_slice %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %20 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_50, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_51 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_52 = tensor.insert_slice %20 into %inserted_slice_49[0, 0, %c2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 %extracted_slice_53 = tensor.extract_slice %extracted_slice_42[0, 0, %c6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x7x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %extracted_slice_54 = tensor.extract_slice %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %21 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_53, %extracted_slice_43 : tensor<1x1x1x1xf32>, tensor<1x1x1x16xf32>) outs(%extracted_slice_54 : tensor<1x1x1x16xf32>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_56: f32, %out: f32):
+# CHECK-NEXT:                   %22 = arith.mulf %in, %in_56 : f32
+# CHECK-NEXT:                   %23 = arith.addf %out, %22 : f32
+# CHECK-NEXT:                   linalg.yield %23 : f32
+# CHECK-NEXT:                 } -> tensor<1x1x1x16xf32>
+# CHECK-NEXT:                 %inserted_slice_55 = tensor.insert_slice %21 into %inserted_slice_52[0, 0, %c3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_55 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               scf.yield %9 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_9 = tensor.insert_slice %8 into %arg10[0, 0, 0, %arg9] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x1x4x64xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_9 : tensor<1x1x4x64xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_6 = tensor.insert_slice %7 into %arg8[0, 0, %arg7, 0] [1, 1, 4, 64] [1, 1, 1, 1] : tensor<1x1x4x64xf32> into tensor<1x1x112x64xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_6 : tensor<1x1x112x64xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_3 = tensor.insert_slice %5 into %arg6[0, %arg5, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : tensor<1x1x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_3 : tensor<1x112x112x64xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0, 0, 0] [1, 112, 112, 64] [1, 1, 1, 1] : tensor<1x112x112x64xf32> into tensor<1x112x112x64xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<1x112x112x64xf32>, memref<1x112x112x64xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @conv2d_nhwc_r181(%arg0: memref<1x230x230x3xf32> {llvm.noalias}, %arg1: memref<7x7x3x64xf32> {llvm.noalias}, %arg2: memref<1x112x112x64xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c7 = arith.constant 7 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c64 = arith.constant 64 : index
+# CHECK-NEXT:     %c112 = arith.constant 112 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c64 step %c1 iter_args(%arg8 = %subview_2) -> (memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_4 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_O_0_} ins(%cst : f32) outs(%subview_4 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_4, %subview_5 : memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_3 : memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %subview = memref.subview %arg0[0, 0, 0, 0] [1, 229, 229, 3] [1, 1, 1, 1] : memref<1x230x230x3xf32> to memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %0) -> (memref<1x112x112x64xf32>) {
+# CHECK-NEXT:       %2 = affine.apply #map(%arg3)
+# CHECK-NEXT:       %subview_0 = memref.subview %subview[0, %2, 0, 0] [1, 7, 229, 3] [1, 1, 1, 1] : memref<1x229x229x3xf32, strided<[158700, 690, 3, 1]>> to memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c112 step %c4 iter_args(%arg6 = %subview_1) -> (memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:         %4 = affine.apply #map(%arg5)
+# CHECK-NEXT:         %subview_3 = memref.subview %subview_0[0, 0, %4, 0] [1, 7, 13, 3] [1, 1, 1, 1] : memref<1x7x229x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c64 step %c16 iter_args(%arg8 = %subview_4) -> (memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_6 = memref.subview %arg1[0, 0, 0, %arg7] [7, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x64xf32> to memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:           %subview_7 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           %6 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %subview_7) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_9 = memref.subview %subview_3[0, %arg9, 0, 0] [1, 1, 13, 3] [1, 1, 1, 1] : memref<1x7x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_10 = memref.subview %subview_6[%arg9, 0, 0, 0] [1, 7, 3, 16] [1, 1, 1, 1] : memref<7x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:             %7 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_11 = memref.subview %subview_9[0, 0, %arg11, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x1x13x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_12 = memref.subview %subview_10[0, %arg11, 0, 0] [1, 1, 3, 16] [1, 1, 1, 1] : memref<1x7x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_13 = memref.subview %subview_11[0, 0, 0, 0] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_14 = memref.subview %subview_12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_15 = memref.subview %subview_13[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_16 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_15, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_16 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_17 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_16, %subview_17 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_18 = memref.subview %subview_13[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_19 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_18, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_19 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_20 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_19, %subview_20 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_21 = memref.subview %subview_13[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_22 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_21, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_22 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_23 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_22, %subview_23 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_24 = memref.subview %subview_13[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_25 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_24, %subview_14 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_25 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_26 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_25, %subview_26 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_27 = memref.subview %subview_11[0, 0, 0, 1] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_28 = memref.subview %subview_12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_29 = memref.subview %subview_27[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_30 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_29, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_30 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_31 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_30, %subview_31 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_32 = memref.subview %subview_27[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_33 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_32, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_33 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_34 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_33, %subview_34 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_35 = memref.subview %subview_27[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_36 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_35, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_36 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_37 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_36, %subview_37 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_38 = memref.subview %subview_27[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_39 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_38, %subview_28 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_39 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_40 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_39, %subview_40 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_41 = memref.subview %subview_11[0, 0, 0, 2] [1, 1, 7, 1] [1, 1, 1, 1] : memref<1x1x7x3xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_42 = memref.subview %subview_12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x3x16xf32, strided<[1344, 192, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_43 = memref.subview %subview_41[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_44 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_43, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_44 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_45 = memref.subview %arg12[0, 0, 0, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_44, %subview_45 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_46 = memref.subview %subview_41[0, 0, 2, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_47 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_46, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_47 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_48 = memref.subview %arg12[0, 0, 1, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_47, %subview_48 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_49 = memref.subview %subview_41[0, 0, 4, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_50 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_49, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_50 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_51 = memref.subview %arg12[0, 0, 2, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_50, %subview_51 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               %subview_52 = memref.subview %subview_41[0, 0, 6, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x7x1xf32, strided<[158700, 690, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_53 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_52, %subview_42 : memref<1x1x1x1xf32, strided<[158700, 690, 3, 1], offset: ?>>, memref<1x1x1x16xf32, strided<[1344, 192, 64, 1], offset: ?>>) outs(%subview_53 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>) attrs =  {__xtc_id_O_} {
+# CHECK-NEXT:               ^bb0(%in: f32, %in_55: f32, %out: f32):
+# CHECK-NEXT:                 %8 = arith.mulf %in, %in_55 : f32
+# CHECK-NEXT:                 %9 = arith.addf %out, %8 : f32
+# CHECK-NEXT:                 linalg.yield %9 : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_54 = memref.subview %arg12[0, 0, 3, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %subview_53, %subview_54 : memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg12 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %7 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %6, %subview_8 : memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 4, 64] [1, 1, 1, 1] : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_5 : memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x4x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 112, 64] [1, 1, 1, 1] : memref<1x112x112x64xf32> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>> to memref<1x1x112x64xf32, strided<[802816, 7168, 64, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x112x112x64xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<1x112x112x64xf32> to memref<1x112x112x64xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: conv2d_nhwc_r181
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x230x230x3xfloat32
+# CHECK-NEXT:   - %1 : 7x7x3x64xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 1x112x112x64xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: conv2d(%0, %1, stride=(2, 2)) {name = 'O'} : [1x230x230x3xfloat32, 7x7x3x64xfloat32] -> [1x112x112x64xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
new file mode 100644
index 00000000..dd676fa2
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_mlir_tensor.py
@@ -0,0 +1,238 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"  # problem sizes: A is I x K, B is K x J
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="matmul") as gb:
+    O.matmul(a, b, name="C")  # only node of the graph; expected IR tags its ops with __xtc_id_C_*
+
+graph = gb.graph
+print(graph)  # expectations place this text after the IR dumps (presumably a stdout/stderr ordering effect - confirm)
+
+impl = Backend(graph, use_tensor_dialect=True)  # expected source IR takes tensor<...> inputs and a memref output
+
+sch = impl.get_scheduler()
+sched = sch.schedule()  # no scheduling directive is issued before schedule() in this test
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="matmul_mlir_tensor",
+    print_source_ir=True,  # presumably enables the "IR Dump Before transform" section - confirm
+    print_transformed_ir=True,  # enables the "IR Dump After transform" section
+    print_bufferization_ir=True,  # enables the "IR Dump Before/After Tensor Lowering" sections
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")  # the expectations require the value 0 here
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_C_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_C_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_C_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_6 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_12 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_3 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_3 to %c32 step %c1_4 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %4 = linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_6 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_6 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg3, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_3 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:       %c32 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0_5 to %c32 step %c1_6 iter_args(%arg6 = %extracted_slice_4) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_8 = tensor.extract_slice %extracted_slice_3[0, %arg5] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_9 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0_10 to %c512 step %c1_11 iter_args(%arg8 = %extracted_slice_9) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %extracted_slice_7[0, %arg7] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_14 = tensor.extract_slice %extracted_slice_8[%arg7, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %5 = linalg.matmul {__xtc_id_C_} ins(%extracted_slice_13, %extracted_slice_14 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %5 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_12 = tensor.insert_slice %4 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_12 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %3 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %2 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_C_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg3, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg1[0, %arg5] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %3 = scf.for %arg7 = %c0 to %c512 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg7] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg7, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_C_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %3, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %2, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     memref.copy %1, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %2 : 4x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: matmul(%0, %1) {name = 'C'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
new file mode 100644
index 00000000..b8352285
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_matmul_relu_mlir_tensor.py
@@ -0,0 +1,377 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"  # problem sizes: A is I x K, B is K x J
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+
+with O.graph(name="matmul_relu") as gb:
+    m = O.matmul(a, b, name="matmul")
+    O.relu(m, name="relu")  # relu consumes the matmul result; it is the last node created in the graph
+
+graph = gb.graph
+print(graph)
+
+impl = Backend(graph, use_tensor_dialect=True)  # expected source IR takes tensor<...> inputs and a memref output
+
+sch = impl.get_scheduler(default_node="matmul")  # presumably the directives below apply to the "matmul" node - confirm
+sch.tile("i", {"i1": 2})  # expected transform: tile_sizes [2, 0, 0] on the "./i" loop
+sch.tile("j", {"j1": 16})  # expected transform: tile_sizes [0, 16, 0] on the "./j" loop
+sch.interchange(["k", "i", "j", "i1", "j1"])  # expected transform tiles k first, then i, j and i1
+sch.vectorize(["j1"])  # expected transform includes the @_vecto named sequence
+sch.unroll({"i1": 2})  # expected transform: transform.loop.unroll with factor = 2
+sched = sch.schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="matmul_relu_mlir_tensor",
+    print_source_ir=True,  # presumably enables the "IR Dump Before transform" section - confirm
+    print_transformed_ir=True,  # enables the "IR Dump After transform" section
+    print_bufferization_ir=True,  # enables the "IR Dump Before/After Tensor Lowering" sections
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)
+res = executor.execute()
+print(f"CODE: {res}")
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_matmul_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %cst_0 : tensor<4x32xf32>, f32) outs(%3 : tensor<4x32xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %5 = arith.maximumf %in, %in_1 : f32
+# CHECK-NEXT:       linalg.yield %5 : f32
+# CHECK-NEXT:     } -> tensor<4x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_matmul_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_matmul_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./k" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [2, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./i1" : !transform.any_op
+# CHECK-NEXT:     transform.include @_vecto failures(suppress) (%tiled_linalg_op_8) : (!transform.any_op) -> ()
+# CHECK-NEXT:     transform.loop.unroll %loops_9 {factor = 2 : i64} : !transform.any_op
+# CHECK-NEXT:     %2 = transform.get_parent_op %loops_3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.reduction_to_contract
+# CHECK-NEXT:       transform.apply_patterns.vector.transfer_permutation_patterns
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     transform.apply_patterns to %2 {
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_outerproduct
+# CHECK-NEXT:       transform.apply_patterns.vector.lower_contraction
+# CHECK-NEXT:     } : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_relu_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./i" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_5 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
+# CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
+# CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
+# CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
+# CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_15 : tensor<2x32xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_2 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_5: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_5 : f32
+# CHECK-NEXT:         linalg.yield %7 : f32
+# CHECK-NEXT:       } -> tensor<1x32xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %cst = arith.constant dense<0.000000e+00> : vector<1x16xf32>
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_4 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_matmul_0_} ins(%cst_0 : f32) outs(%extracted_slice_4 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_5 = tensor.insert_slice %7 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_5 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %2) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[0, %arg3] [4, 1] [1, 1] : tensor<4x512xf32> to tensor<4x1xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg1[%arg3, 0] [1, 32] [1, 1] : tensor<512x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_5 = tensor.extract_slice %extracted_slice[%arg5, 0] [2, 1] [1, 1] : tensor<4x1xf32> to tensor<2x1xf32>
+# CHECK-NEXT:         %extracted_slice_6 = tensor.extract_slice %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<4x32xf32> to tensor<2x32xf32>
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %extracted_slice_6) -> (tensor<2x32xf32>) {
+# CHECK-NEXT:           %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, %arg7] [1, 16] [1, 1] : tensor<1x32xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %extracted_slice_8 = tensor.extract_slice %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x32xf32> to tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%c0, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_10 = tensor.extract_slice %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %extracted_slice_9[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %9 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.transfer_read %extracted_slice_10[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %11 = vector.extract %9[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %13 = vector.broadcast %12 : f32 to vector<16xf32>
+# CHECK-NEXT:           %14 = vector.extract %10[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %15 = vector.fma %13, %11, %14 : vector<16xf32>
+# CHECK-NEXT:           %16 = vector.insert %15, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.transfer_write %16, %extracted_slice_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_11 = tensor.insert_slice %17 into %extracted_slice_8[%c0, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[%c1, 0] [1, 1] [1, 1] : tensor<2x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_13 = tensor.extract_slice %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:           %18 = vector.transfer_read %extracted_slice_12[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x1xf32>, vector<1x1xf32>
+# CHECK-NEXT:           %19 = vector.transfer_read %extracted_slice_7[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.transfer_read %extracted_slice_13[%c0, %c0], %0 {in_bounds = [true, true]} : tensor<1x16xf32>, vector<1x16xf32>
+# CHECK-NEXT:           %21 = vector.extract %19[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %22 = vector.extract %18[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %23 = vector.broadcast %22 : f32 to vector<16xf32>
+# CHECK-NEXT:           %24 = vector.extract %20[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %25 = vector.fma %23, %21, %24 : vector<16xf32>
+# CHECK-NEXT:           %26 = vector.insert %25, %cst [0] : vector<16xf32> into vector<1x16xf32>
+# CHECK-NEXT:           %27 = vector.transfer_write %26, %extracted_slice_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, tensor<1x16xf32>
+# CHECK-NEXT:           %inserted_slice_14 = tensor.insert_slice %27 into %inserted_slice_11[%c1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
+# CHECK-NEXT:           %inserted_slice_15 = tensor.insert_slice %inserted_slice_14 into %arg8[0, %arg7] [2, 16] [1, 1] : tensor<2x16xf32> into tensor<2x32xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_15 : tensor<2x32xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %inserted_slice = tensor.insert_slice %7 into %arg6[%arg5, 0] [2, 32] [1, 1] : tensor<2x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %6 : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_2 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c4_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %3[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %extracted_slice_4 = tensor.extract_slice %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %cst_0 : tensor<1x32xf32>, f32) outs(%extracted_slice_4 : tensor<1x32xf32>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_5: f32, %out: f32):
+# CHECK-NEXT:         %7 = arith.maximumf %in, %in_5 : f32
+# CHECK-NEXT:         linalg.yield %7 : f32
+# CHECK-NEXT:       } -> tensor<1x32xf32>
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<4x32xf32>, memref<4x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1) -> ()>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul_relu(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<4x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = ub.poison : f32
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c2 = arith.constant 2 : index
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %alloca) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %1) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[0, %arg3] [4, 1] [1, 1] : memref<4x512xf32> to memref<4x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg1[%arg3, 0] [1, 32] [1, 1] : memref<512x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (memref<4x32xf32>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %subview[%arg5, 0] [2, 1] [1, 1] : memref<4x1xf32, strided<[512, 1], offset: ?>> to memref<2x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:         %subview_2 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c32 step %c16 iter_args(%arg8 = %subview_2) -> (memref<2x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_4 = memref.subview %subview_0[0, %arg7] [1, 16] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_1[0, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_7 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %6 = vector.transfer_read %subview_6[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %7 = vector.transfer_read %subview_4[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %8 = vector.transfer_read %subview_7[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %9 = vector.extract %7[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %10 = vector.extract %6[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %11 = vector.broadcast %10 : f32 to vector<16xf32>
+# CHECK-NEXT:           %12 = vector.extract %8[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %13 = vector.fma %11, %9, %12 : vector<16xf32>
+# CHECK-NEXT:           %14 = vector.broadcast %13 : vector<16xf32> to vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %14, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_8 = memref.subview %subview_5[0, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_7, %subview_8 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_9 = memref.subview %subview_1[1, 0] [1, 1] [1, 1] : memref<2x1xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_10 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %15 = vector.transfer_read %subview_9[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x1xf32, strided<[512, 1], offset: ?>>, vector<1x1xf32>
+# CHECK-NEXT:           %16 = vector.transfer_read %subview_10[%c0, %c0], %0 {in_bounds = [true, true]} : memref<1x16xf32, strided<[32, 1], offset: ?>>, vector<1x16xf32>
+# CHECK-NEXT:           %17 = vector.extract %15[0, 0] : f32 from vector<1x1xf32>
+# CHECK-NEXT:           %18 = vector.broadcast %17 : f32 to vector<16xf32>
+# CHECK-NEXT:           %19 = vector.extract %16[0] : vector<16xf32> from vector<1x16xf32>
+# CHECK-NEXT:           %20 = vector.fma %18, %9, %19 : vector<16xf32>
+# CHECK-NEXT:           %21 = vector.broadcast %20 : vector<16xf32> to vector<1x16xf32>
+# CHECK-NEXT:           vector.transfer_write %21, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x16xf32>, memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_11 = memref.subview %subview_5[1, 0] [1, 16] [1, 1] : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_10, %subview_11 : memref<1x16xf32, strided<[32, 1], offset: ?>> to memref<1x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           %subview_12 = memref.subview %arg8[0, %arg7] [2, 16] [1, 1] : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_5, %subview_12 : memref<2x16xf32, strided<[32, 1], offset: ?>> to memref<2x16xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[%arg5, 0] [2, 32] [1, 1] : memref<4x32xf32> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_3 : memref<2x32xf32, strided<[32, 1], offset: ?>> to memref<2x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<4x32xf32>
+# CHECK-NEXT:       } {"./i"}
+# CHECK-NEXT:       scf.yield %4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./k"}
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %2[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %cst : memref<1x32xf32, strided<[32, 1], offset: ?>>, f32) outs(%subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>>) attrs =  {__xtc_id_relu_} {
+# CHECK-NEXT:       ^bb0(%in: f32, %in_2: f32, %out: f32):
+# CHECK-NEXT:         %4 = arith.maximumf %in, %in_2 : f32
+# CHECK-NEXT:         linalg.yield %4 : f32
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[%arg3, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %subview_0, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     memref.copy %3, %arg2 : memref<4x32xf32> to memref<4x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul_relu
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %3 : 4x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: matmul(%0, %1) {name = 'matmul'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:   - %3: relu(%2) {name = 'relu'} : [4x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
new file mode 100644
index 00000000..2ebcefb3
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_conv2d_mlir_tensor.py
@@ -0,0 +1,930 @@
+# RUN: python %s 2>&1 | filecheck %s
+# REQUIRES: module_mlir
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+# Small padded conv2d: input I is (N, H, W, C) = 1x8x8x3, weights W are (R, S, C, F) = 5x5x3x16, stride (SH, SW) = 2x2.
+N, H, W, F, R, S, C, SH, SW, dtype = 1, 8, 8, 16, 5, 5, 3, 2, 2, "float32"
+a = O.tensor((N, H, W, C), dtype, name="I")
+b = O.tensor((R, S, C, F), dtype, name="W")
+
+with O.graph(name="pad_conv2d_nhwc_mini") as gb:
+    p = O.pad2d(a, padding=2, axes=(1, 2), name="pad")  # pads axes 1 and 2 (H, W) by 2; expected IR below shows 1x12x12x3
+    O.conv2d(p, b, stride=(SH, SW), name="conv")  # consumes the padded tensor; expected IR below shows a 1x4x4x16 result
+
+graph = gb.graph
+print(graph)  # graph summary is printed on stdout
+
+impl = Backend(graph, use_tensor_dialect=True)  # function inputs appear as tensor<...> values in the expected IR below
+
+sch = impl.get_scheduler()
+sched = sch.schedule()  # no scheduling directives are issued before schedule()
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="pad_conv2d_nhwc_mini_mlir_tensor",
+    print_source_ir=True,  # presumably enables the "IR Dump Before transform" section - TODO confirm
+    print_transformed_ir=True,  # enables the "IR Dump After transform" section
+    print_bufferization_ir=True,  # enables the "IR Dump Before/After Tensor Lowering" sections
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)  # NOTE(review): validate=True presumably checks results against a reference - confirm
+res = executor.execute()
+print(f"CODE: {res}")  # emits the execution result as the final "CODE: ..." line
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %padded = tensor.pad %arg0 nofold low[0, 2, 2, 0] high[0, 2, 2, 0] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
+# CHECK-NEXT:       tensor.yield %cst : f32
+# CHECK-NEXT:     } {__xtc_id_pad_} : tensor<1x8x8x3xf32> to tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %2 = linalg.fill {__xtc_id_conv_0_} ins(%cst_0 : f32) outs(%1 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%padded, %arg1 : tensor<1x12x12x3xf32>, tensor<5x5x3x16xf32>) outs(%2 : tensor<1x4x4x16xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:     ^bb0(%in: f32, %in_1: f32, %out: f32):
+# CHECK-NEXT:       %4 = arith.mulf %in, %in_1 : f32
+# CHECK-NEXT:       %5 = arith.addf %out, %4 : f32
+# CHECK-NEXT:       linalg.yield %5 : f32
+# CHECK-NEXT:     } -> tensor<1x4x4x16xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %tiled_linalg_op_0 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./c" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_conv_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./f" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_conv_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %2 tile_sizes [1, 0, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./b" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 1, 0, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./h" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_18, %loops_19 = transform.structured.tile_using_for %tiled_linalg_op_16 tile_sizes [0, 0, 1, 0, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_19 "./w" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_20, %loops_21 = transform.structured.tile_using_for %tiled_linalg_op_18 tile_sizes [0, 0, 0, 1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_21 "./f" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_22, %loops_23 = transform.structured.tile_using_for %tiled_linalg_op_20 tile_sizes [0, 0, 0, 0, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_23 "./r" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_24, %loops_25 = transform.structured.tile_using_for %tiled_linalg_op_22 tile_sizes [0, 0, 0, 0, 0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_25 "./s" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_26, %loops_27 = transform.structured.tile_using_for %tiled_linalg_op_24 tile_sizes [0, 0, 0, 0, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_27 "./c" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)>
+# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)>
+# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)>
+# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %6 = arith.cmpi eq, %c8, %c0_8 : index
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c8_10 = arith.constant 8 : index
+# CHECK-NEXT:       %7 = arith.cmpi eq, %c8_10, %c0_9 : index
+# CHECK-NEXT:       %8 = arith.ori %7, %6 : i1
+# CHECK-NEXT:       %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32>
+# CHECK-NEXT:         %10 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c12 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:           %12 = affine.apply #map(%arg5)
+# CHECK-NEXT:           %13 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %14 = affine.apply #map2(%arg5)
+# CHECK-NEXT:           %15 = affine.max #map3(%arg5)
+# CHECK-NEXT:           %16 = affine.min #map4(%15)
+# CHECK-NEXT:           %17 = affine.apply #map5(%13)
+# CHECK-NEXT:           %18 = affine.apply #map6(%16)
+# CHECK-NEXT:           %19 = affine.min #map7(%16, %13)
+# CHECK-NEXT:           %20 = affine.max #map8(%19)
+# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0_13 : index
+# CHECK-NEXT:           %22 = affine.apply #map5(%20)
+# CHECK-NEXT:           %23 = affine.apply #map9(%13, %20)
+# CHECK-NEXT:           %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:           %c8_15 = arith.constant 8 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %c8_15, %c0_14 : index
+# CHECK-NEXT:           %25 = arith.ori %24, %21 : i1
+# CHECK-NEXT:           %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32>
+# CHECK-NEXT:             %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:             %27 = tensor.empty() : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:             %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:             %c12_21 = arith.constant 12 : index
+# CHECK-NEXT:             %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:             %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:               %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:               %29 = affine.max #map10(%13)
+# CHECK-NEXT:               %30 = affine.apply #map11(%13)
+# CHECK-NEXT:               %31 = affine.max #map12(%13)
+# CHECK-NEXT:               %32 = affine.min #map13(%31, %20)
+# CHECK-NEXT:               %33 = affine.apply #map5(%29)
+# CHECK-NEXT:               %34 = affine.apply #map14(%20, %32)
+# CHECK-NEXT:               %35 = affine.min #map15(%20, %32, %29)
+# CHECK-NEXT:               %36 = affine.max #map8(%35)
+# CHECK-NEXT:               %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:               %37 = arith.cmpi eq, %36, %c0_24 : index
+# CHECK-NEXT:               %38 = affine.apply #map5(%36)
+# CHECK-NEXT:               %39 = affine.apply #map9(%29, %36)
+# CHECK-NEXT:               %40 = affine.apply #map(%arg7)
+# CHECK-NEXT:               %41 = affine.max #map1(%arg7)
+# CHECK-NEXT:               %42 = affine.apply #map2(%arg7)
+# CHECK-NEXT:               %43 = affine.max #map3(%arg7)
+# CHECK-NEXT:               %44 = affine.min #map4(%43)
+# CHECK-NEXT:               %45 = affine.apply #map5(%41)
+# CHECK-NEXT:               %46 = affine.apply #map6(%44)
+# CHECK-NEXT:               %47 = affine.min #map7(%44, %41)
+# CHECK-NEXT:               %48 = affine.max #map8(%47)
+# CHECK-NEXT:               %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:               %49 = arith.cmpi eq, %48, %c0_25 : index
+# CHECK-NEXT:               %50 = arith.ori %49, %37 : i1
+# CHECK-NEXT:               %51 = affine.apply #map5(%48)
+# CHECK-NEXT:               %52 = affine.apply #map9(%41, %48)
+# CHECK-NEXT:               %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                 %generated = tensor.generate  {
+# CHECK-NEXT:                 ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+# CHECK-NEXT:                   tensor.yield %cst : f32
+# CHECK-NEXT:                 } : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %generated : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32>
+# CHECK-NEXT:                 %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %54 = tensor.empty() : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2_30 = arith.constant 2 : index
+# CHECK-NEXT:                 %c0_31 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:                 %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                   %c1_34 = arith.constant 1 : index
+# CHECK-NEXT:                   %56 = affine.max #map10(%29)
+# CHECK-NEXT:                   %57 = affine.apply #map11(%29)
+# CHECK-NEXT:                   %58 = affine.max #map12(%29)
+# CHECK-NEXT:                   %59 = affine.min #map13(%58, %36)
+# CHECK-NEXT:                   %60 = affine.apply #map5(%56)
+# CHECK-NEXT:                   %61 = affine.apply #map14(%36, %59)
+# CHECK-NEXT:                   %62 = affine.min #map15(%36, %59, %56)
+# CHECK-NEXT:                   %63 = affine.max #map8(%62)
+# CHECK-NEXT:                   %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:                   %64 = arith.cmpi eq, %63, %c0_35 : index
+# CHECK-NEXT:                   %65 = affine.apply #map5(%63)
+# CHECK-NEXT:                   %66 = affine.apply #map9(%56, %63)
+# CHECK-NEXT:                   %c2_36 = arith.constant 2 : index
+# CHECK-NEXT:                   %67 = affine.max #map10(%41)
+# CHECK-NEXT:                   %68 = affine.apply #map11(%41)
+# CHECK-NEXT:                   %69 = affine.max #map12(%41)
+# CHECK-NEXT:                   %70 = affine.min #map13(%69, %48)
+# CHECK-NEXT:                   %71 = affine.apply #map5(%67)
+# CHECK-NEXT:                   %72 = affine.apply #map14(%48, %70)
+# CHECK-NEXT:                   %73 = affine.min #map15(%48, %70, %67)
+# CHECK-NEXT:                   %74 = affine.max #map8(%73)
+# CHECK-NEXT:                   %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                   %75 = arith.cmpi eq, %74, %c0_37 : index
+# CHECK-NEXT:                   %76 = arith.ori %75, %64 : i1
+# CHECK-NEXT:                   %77 = affine.apply #map5(%74)
+# CHECK-NEXT:                   %78 = affine.apply #map9(%67, %74)
+# CHECK-NEXT:                   %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                     %generated = tensor.generate  {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %generated : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   } else {
+# CHECK-NEXT:                     %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %cast_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                   %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %cast_33 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             } {"./w"}
+# CHECK-NEXT:             %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %cast : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         } {"./h"}
+# CHECK-NEXT:         scf.yield %11 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_12 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %7 = affine.apply #map16(%arg5)
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_16 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:         %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %9 = affine.apply #map16(%arg7)
+# CHECK-NEXT:           %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:           %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_28 = arith.constant 0 : index
+# CHECK-NEXT:             %c5 = arith.constant 5 : index
+# CHECK-NEXT:             %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:             %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_34 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_35 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:               %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_42 = arith.constant 1 : index
+# CHECK-NEXT:                 %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_48: f32, %out: f32):
+# CHECK-NEXT:                     %15 = arith.mulf %in, %in_48 : f32
+# CHECK-NEXT:                     %16 = arith.addf %out, %15 : f32
+# CHECK-NEXT:                     linalg.yield %16 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, -d0 + 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (d0 - 2)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (d0 - 2, 0)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (d0, 8)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map6 = affine_map<(d0) -> (-d0 + 8)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map8 = affine_map<(d0) -> (d0, 0)>
+# CHECK-NEXT: #map9 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map10 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map11 = affine_map<(d0) -> (-d0)>
+# CHECK-NEXT: #map12 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map13 = affine_map<(d0, d1) -> (d0, d1)>
+# CHECK-NEXT: #map14 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map15 = affine_map<(d0, d1, d2) -> (d0 - d1, -d2 + 1)>
+# CHECK-NEXT: #map16 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map18 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map19 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: tensor<1x8x8x3xf32> {llvm.noalias}, %arg1: tensor<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_0 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c1 step %c1_0 iter_args(%arg4 = %1) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c8 = arith.constant 8 : index
+# CHECK-NEXT:       %6 = arith.cmpi eq, %c8, %c0_8 : index
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c8_10 = arith.constant 8 : index
+# CHECK-NEXT:       %7 = arith.cmpi eq, %c8_10, %c0_9 : index
+# CHECK-NEXT:       %8 = arith.ori %7, %6 : i1
+# CHECK-NEXT:       %9 = scf.if %8 -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice = tensor.extract_slice %arg0[%arg3, 0, 0, 0] [1, 8, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x8x8x3xf32>
+# CHECK-NEXT:         %10 = tensor.empty() : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c12 = arith.constant 12 : index
+# CHECK-NEXT:         %c1_12 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg5 = %c0_11 to %c12 step %c1_12 iter_args(%arg6 = %10) -> (tensor<1x12x12x3xf32>) {
+# CHECK-NEXT:           %12 = affine.apply #map(%arg5)
+# CHECK-NEXT:           %13 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %14 = affine.apply #map2(%arg5)
+# CHECK-NEXT:           %15 = affine.max #map3(%arg5)
+# CHECK-NEXT:           %16 = affine.min #map4(%15)
+# CHECK-NEXT:           %17 = affine.apply #map5(%13)
+# CHECK-NEXT:           %18 = affine.apply #map6(%16)
+# CHECK-NEXT:           %19 = affine.min #map7(%16, %13)
+# CHECK-NEXT:           %20 = affine.max #map8(%19)
+# CHECK-NEXT:           %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0_13 : index
+# CHECK-NEXT:           %22 = affine.apply #map5(%20)
+# CHECK-NEXT:           %23 = affine.apply #map9(%13, %20)
+# CHECK-NEXT:           %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:           %c8_15 = arith.constant 8 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %c8_15, %c0_14 : index
+# CHECK-NEXT:           %25 = arith.ori %24, %21 : i1
+# CHECK-NEXT:           %26 = scf.if %25 -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_17 = tensor.extract_slice %extracted_slice[0, %16, 0, 0] [1, %20, 8, 3] [1, 1, 1, 1] : tensor<1x8x8x3xf32> to tensor<1x?x8x3xf32>
+# CHECK-NEXT:             %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:             %27 = tensor.empty() : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:             %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:             %c12_21 = arith.constant 12 : index
+# CHECK-NEXT:             %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:             %28 = scf.for %arg7 = %c0_20 to %c12_21 step %c1_22 iter_args(%arg8 = %27) -> (tensor<1x1x12x3xf32>) {
+# CHECK-NEXT:               %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:               %29 = affine.max #map10(%13)
+# CHECK-NEXT:               %30 = affine.apply #map11(%13)
+# CHECK-NEXT:               %31 = affine.max #map12(%13)
+# CHECK-NEXT:               %32 = affine.min #map13(%31, %20)
+# CHECK-NEXT:               %33 = affine.apply #map5(%29)
+# CHECK-NEXT:               %34 = affine.apply #map14(%20, %32)
+# CHECK-NEXT:               %35 = affine.min #map15(%20, %32, %29)
+# CHECK-NEXT:               %36 = affine.max #map8(%35)
+# CHECK-NEXT:               %c0_24 = arith.constant 0 : index
+# CHECK-NEXT:               %37 = arith.cmpi eq, %36, %c0_24 : index
+# CHECK-NEXT:               %38 = affine.apply #map5(%36)
+# CHECK-NEXT:               %39 = affine.apply #map9(%29, %36)
+# CHECK-NEXT:               %40 = affine.apply #map(%arg7)
+# CHECK-NEXT:               %41 = affine.max #map1(%arg7)
+# CHECK-NEXT:               %42 = affine.apply #map2(%arg7)
+# CHECK-NEXT:               %43 = affine.max #map3(%arg7)
+# CHECK-NEXT:               %44 = affine.min #map4(%43)
+# CHECK-NEXT:               %45 = affine.apply #map5(%41)
+# CHECK-NEXT:               %46 = affine.apply #map6(%44)
+# CHECK-NEXT:               %47 = affine.min #map7(%44, %41)
+# CHECK-NEXT:               %48 = affine.max #map8(%47)
+# CHECK-NEXT:               %c0_25 = arith.constant 0 : index
+# CHECK-NEXT:               %49 = arith.cmpi eq, %48, %c0_25 : index
+# CHECK-NEXT:               %50 = arith.ori %49, %37 : i1
+# CHECK-NEXT:               %51 = affine.apply #map5(%48)
+# CHECK-NEXT:               %52 = affine.apply #map9(%41, %48)
+# CHECK-NEXT:               %53 = scf.if %50 -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                 %generated = tensor.generate  {
+# CHECK-NEXT:                 ^bb0(%arg9: index, %arg10: index, %arg11: index, %arg12: index):
+# CHECK-NEXT:                   tensor.yield %cst : f32
+# CHECK-NEXT:                 } : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %generated : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %extracted_slice_27 = tensor.extract_slice %extracted_slice_17[0, %32, %44, 0] [1, %36, %48, 3] [1, 1, 1, 1] : tensor<1x?x8x3xf32> to tensor<1x?x?x3xf32>
+# CHECK-NEXT:                 %c1_28 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %54 = tensor.empty() : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:                 %c2_30 = arith.constant 2 : index
+# CHECK-NEXT:                 %c0_31 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_32 = arith.constant 1 : index
+# CHECK-NEXT:                 %55 = scf.for %arg9 = %c0_31 to %c3 step %c1_32 iter_args(%arg10 = %54) -> (tensor<1x1x1x3xf32>) {
+# CHECK-NEXT:                   %c1_34 = arith.constant 1 : index
+# CHECK-NEXT:                   %56 = affine.max #map10(%29)
+# CHECK-NEXT:                   %57 = affine.apply #map11(%29)
+# CHECK-NEXT:                   %58 = affine.max #map12(%29)
+# CHECK-NEXT:                   %59 = affine.min #map13(%58, %36)
+# CHECK-NEXT:                   %60 = affine.apply #map5(%56)
+# CHECK-NEXT:                   %61 = affine.apply #map14(%36, %59)
+# CHECK-NEXT:                   %62 = affine.min #map15(%36, %59, %56)
+# CHECK-NEXT:                   %63 = affine.max #map8(%62)
+# CHECK-NEXT:                   %c0_35 = arith.constant 0 : index
+# CHECK-NEXT:                   %64 = arith.cmpi eq, %63, %c0_35 : index
+# CHECK-NEXT:                   %65 = affine.apply #map5(%63)
+# CHECK-NEXT:                   %66 = affine.apply #map9(%56, %63)
+# CHECK-NEXT:                   %c2_36 = arith.constant 2 : index
+# CHECK-NEXT:                   %67 = affine.max #map10(%41)
+# CHECK-NEXT:                   %68 = affine.apply #map11(%41)
+# CHECK-NEXT:                   %69 = affine.max #map12(%41)
+# CHECK-NEXT:                   %70 = affine.min #map13(%69, %48)
+# CHECK-NEXT:                   %71 = affine.apply #map5(%67)
+# CHECK-NEXT:                   %72 = affine.apply #map14(%48, %70)
+# CHECK-NEXT:                   %73 = affine.min #map15(%48, %70, %67)
+# CHECK-NEXT:                   %74 = affine.max #map8(%73)
+# CHECK-NEXT:                   %c0_37 = arith.constant 0 : index
+# CHECK-NEXT:                   %75 = arith.cmpi eq, %74, %c0_37 : index
+# CHECK-NEXT:                   %76 = arith.ori %75, %64 : i1
+# CHECK-NEXT:                   %77 = affine.apply #map5(%74)
+# CHECK-NEXT:                   %78 = affine.apply #map9(%67, %74)
+# CHECK-NEXT:                   %79 = scf.if %76 -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                     %generated = tensor.generate  {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %generated : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   } else {
+# CHECK-NEXT:                     %extracted_slice_39 = tensor.extract_slice %extracted_slice_27[0, %59, %70, %arg9] [1, %63, %74, 1] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %padded = tensor.pad %extracted_slice_39 nofold low[0, %56, %67, 0] high[0, %66, %78, 0] {
+# CHECK-NEXT:                     ^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
+# CHECK-NEXT:                       tensor.yield %cst : f32
+# CHECK-NEXT:                     } {__xtc_id_pad_} : tensor<1x?x?x1xf32> to tensor<1x?x?x1xf32>
+# CHECK-NEXT:                     %cast_40 = tensor.cast %padded : tensor<1x?x?x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                     scf.yield %cast_40 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                   %inserted_slice_38 = tensor.insert_slice %79 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x3xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_38 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %cast_33 = tensor.cast %55 : tensor<1x1x1x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 scf.yield %cast_33 : tensor<1x1x1x3xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %inserted_slice_26 = tensor.insert_slice %53 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x1x3xf32> into tensor<1x1x12x3xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_26 : tensor<1x1x12x3xf32>
+# CHECK-NEXT:             } {"./w"}
+# CHECK-NEXT:             %cast = tensor.cast %28 : tensor<1x1x12x3xf32> to tensor<1x1x12x3xf32>
+# CHECK-NEXT:             scf.yield %cast : tensor<1x1x12x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_16 = tensor.insert_slice %26 into %arg6[0, %arg5, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : tensor<1x1x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_16 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:         } {"./h"}
+# CHECK-NEXT:         scf.yield %11 : tensor<1x12x12x3xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %9 into %arg4[%arg3, 0, 0, 0] [1, 12, 12, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> into tensor<1x12x12x3xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x12x12x3xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     %cst_1 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_2 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_4 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg3 = %c0_2 to %c1_3 step %c1_4 iter_args(%arg4 = %3) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_9 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_8 to %c4 step %c1_9 iter_args(%arg6 = %extracted_slice) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_10 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_12 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg7 = %c0_11 to %c4_12 step %c1_13 iter_args(%arg8 = %extracted_slice_10) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %extracted_slice_15 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:           %8 = scf.for %arg9 = %c0_16 to %c16 step %c1_17 iter_args(%arg10 = %extracted_slice_15) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_19 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %9 = linalg.fill {__xtc_id_conv_0_} ins(%cst_1 : f32) outs(%extracted_slice_19 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %inserted_slice_20 = tensor.insert_slice %9 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_20 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_18 = tensor.insert_slice %8 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_18 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_14 = tensor.insert_slice %7 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_14 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c1_6 = arith.constant 1 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_5 to %c1_6 step %c1_7 iter_args(%arg4 = %4) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %2[%arg3, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : tensor<1x12x12x3xf32> to tensor<1x11x11x3xf32>
+# CHECK-NEXT:       %extracted_slice_8 = tensor.extract_slice %arg1[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x4x4x16xf32>
+# CHECK-NEXT:       %c0_10 = arith.constant 0 : index
+# CHECK-NEXT:       %c4 = arith.constant 4 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg5 = %c0_10 to %c4 step %c1_11 iter_args(%arg6 = %extracted_slice_9) -> (tensor<1x4x4x16xf32>) {
+# CHECK-NEXT:         %7 = affine.apply #map16(%arg5)
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %extracted_slice[0, %7, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : tensor<1x11x11x3xf32> to tensor<1x5x11x3xf32>
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %extracted_slice_8[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x1x4x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_16 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_17 = arith.constant 1 : index
+# CHECK-NEXT:         %8 = scf.for %arg7 = %c0_15 to %c4_16 step %c1_17 iter_args(%arg8 = %extracted_slice_14) -> (tensor<1x1x4x16xf32>) {
+# CHECK-NEXT:           %9 = affine.apply #map16(%arg7)
+# CHECK-NEXT:           %extracted_slice_19 = tensor.extract_slice %extracted_slice_12[0, 0, %9, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x11x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [5, 5, 3, 16] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x16xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> to tensor<1x1x1x16xf32>
+# CHECK-NEXT:           %c0_22 = arith.constant 0 : index
+# CHECK-NEXT:           %c16 = arith.constant 16 : index
+# CHECK-NEXT:           %c1_23 = arith.constant 1 : index
+# CHECK-NEXT:           %10 = scf.for %arg9 = %c0_22 to %c16 step %c1_23 iter_args(%arg10 = %extracted_slice_21) -> (tensor<1x1x1x16xf32>) {
+# CHECK-NEXT:             %extracted_slice_25 = tensor.extract_slice %extracted_slice_19[0, 0, 0, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x5x5x3xf32>
+# CHECK-NEXT:             %extracted_slice_26 = tensor.extract_slice %extracted_slice_20[0, 0, 0, %arg9] [5, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x16xf32> to tensor<5x5x3x1xf32>
+# CHECK-NEXT:             %extracted_slice_27 = tensor.extract_slice %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x16xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:             %c0_28 = arith.constant 0 : index
+# CHECK-NEXT:             %c5 = arith.constant 5 : index
+# CHECK-NEXT:             %c1_29 = arith.constant 1 : index
+# CHECK-NEXT:             %11 = scf.for %arg11 = %c0_28 to %c5 step %c1_29 iter_args(%arg12 = %extracted_slice_27) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:               %extracted_slice_31 = tensor.extract_slice %extracted_slice_25[0, %arg11, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x1x5x3xf32>
+# CHECK-NEXT:               %extracted_slice_32 = tensor.extract_slice %extracted_slice_26[%arg11, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : tensor<5x5x3x1xf32> to tensor<1x5x3x1xf32>
+# CHECK-NEXT:               %extracted_slice_33 = tensor.extract_slice %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:               %c0_34 = arith.constant 0 : index
+# CHECK-NEXT:               %c5_35 = arith.constant 5 : index
+# CHECK-NEXT:               %c1_36 = arith.constant 1 : index
+# CHECK-NEXT:               %12 = scf.for %arg13 = %c0_34 to %c5_35 step %c1_36 iter_args(%arg14 = %extracted_slice_33) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                 %extracted_slice_38 = tensor.extract_slice %extracted_slice_31[0, 0, %arg13, 0] [1, 1, 1, 3] [1, 1, 1, 1] : tensor<1x1x5x3xf32> to tensor<1x1x1x3xf32>
+# CHECK-NEXT:                 %extracted_slice_39 = tensor.extract_slice %extracted_slice_32[0, %arg13, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : tensor<1x5x3x1xf32> to tensor<1x1x3x1xf32>
+# CHECK-NEXT:                 %extracted_slice_40 = tensor.extract_slice %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 %c0_41 = arith.constant 0 : index
+# CHECK-NEXT:                 %c3 = arith.constant 3 : index
+# CHECK-NEXT:                 %c1_42 = arith.constant 1 : index
+# CHECK-NEXT:                 %13 = scf.for %arg15 = %c0_41 to %c3 step %c1_42 iter_args(%arg16 = %extracted_slice_40) -> (tensor<1x1x1x1xf32>) {
+# CHECK-NEXT:                   %extracted_slice_44 = tensor.extract_slice %extracted_slice_38[0, 0, 0, %arg15] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x3xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_45 = tensor.extract_slice %extracted_slice_39[0, 0, %arg15, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x3x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %extracted_slice_46 = tensor.extract_slice %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> to tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %14 = linalg.generic {indexing_maps = [#map17, #map18, #map19], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%extracted_slice_44, %extracted_slice_45 : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%extracted_slice_46 : tensor<1x1x1x1xf32>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                   ^bb0(%in: f32, %in_48: f32, %out: f32):
+# CHECK-NEXT:                     %15 = arith.mulf %in, %in_48 : f32
+# CHECK-NEXT:                     %16 = arith.addf %out, %15 : f32
+# CHECK-NEXT:                     linalg.yield %16 : f32
+# CHECK-NEXT:                   } -> tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   %inserted_slice_47 = tensor.insert_slice %14 into %arg16[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                   scf.yield %inserted_slice_47 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 } {"./c"}
+# CHECK-NEXT:                 %inserted_slice_43 = tensor.insert_slice %13 into %arg14[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:                 scf.yield %inserted_slice_43 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:               } {"./s"}
+# CHECK-NEXT:               %inserted_slice_37 = tensor.insert_slice %12 into %arg12[0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x1xf32>
+# CHECK-NEXT:               scf.yield %inserted_slice_37 : tensor<1x1x1x1xf32>
+# CHECK-NEXT:             } {"./r"}
+# CHECK-NEXT:             %inserted_slice_30 = tensor.insert_slice %11 into %arg10[0, 0, 0, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x16xf32>
+# CHECK-NEXT:             scf.yield %inserted_slice_30 : tensor<1x1x1x16xf32>
+# CHECK-NEXT:           } {"./f"}
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %10 into %arg8[0, 0, %arg7, 0] [1, 1, 1, 16] [1, 1, 1, 1] : tensor<1x1x1x16xf32> into tensor<1x1x4x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1x4x16xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         %inserted_slice_18 = tensor.insert_slice %8 into %arg6[0, %arg5, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : tensor<1x1x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_18 : tensor<1x4x4x16xf32>
+# CHECK-NEXT:       } {"./h"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg4[%arg3, 0, 0, 0] [1, 4, 4, 16] [1, 1, 1, 1] : tensor<1x4x4x16xf32> into tensor<1x4x4x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./b"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg2 : (tensor<1x4x4x16xf32>, memref<1x4x4x16xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (-d0 + 2, 0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (0, d0 - 2)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (8, d0)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (-d0 + 8, -d1 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0) -> (-d0, 0)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d1, d0)>
+# CHECK-NEXT: #map7 = affine_map<(d0, d1, d2) -> (-d2 + 1, d0 - d1)>
+# CHECK-NEXT: #map8 = affine_map<(d0, d1) -> (-d0 - d1 + 1)>
+# CHECK-NEXT: #map9 = affine_map<(d0) -> (d0 * 2)>
+# CHECK-NEXT: #map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 2 + d4, d2 * 2 + d5, d6)>
+# CHECK-NEXT: #map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+# CHECK-NEXT: #map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_conv2d_nhwc_mini(%arg0: memref<1x8x8x3xf32> {llvm.noalias}, %arg1: memref<5x5x3x16xf32> {llvm.noalias}, %arg2: memref<1x4x4x16xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c5 = arith.constant 5 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c3 = arith.constant 3 : index
+# CHECK-NEXT:     %c12 = arith.constant 12 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloc = memref.alloc() {alignment = 256 : i64} : memref<1x12x12x3xf32>
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<1x1x12x3xf32>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c12 step %c1 iter_args(%arg4 = %alloc) -> (memref<1x12x12x3xf32>) {
+# CHECK-NEXT:       %3 = affine.max #map(%arg3)
+# CHECK-NEXT:       %4 = affine.max #map1(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map2(%4)
+# CHECK-NEXT:       %6 = affine.min #map3(%5, %3)
+# CHECK-NEXT:       %7 = affine.max #map4(%6)
+# CHECK-NEXT:       %8 = arith.cmpi eq, %7, %c0 : index
+# CHECK-NEXT:       %9 = scf.if %8 -> (memref<1x1x12x3xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca : memref<1x1x12x3xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %10 = linalg.index 0 : index
+# CHECK-NEXT:             %11 = linalg.index 1 : index
+# CHECK-NEXT:             %12 = linalg.index 2 : index
+# CHECK-NEXT:             %13 = linalg.index 3 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca : memref<1x1x12x3xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg0[0, %5, 0, 0] [1, %7, 8, 3] [1, 1, 1, 1] : memref<1x8x8x3xf32> to memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_3, %alloca_0 : memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x12x3xf32>
+# CHECK-NEXT:         %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32>
+# CHECK-NEXT:         %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x3xf32>
+# CHECK-NEXT:         %10 = scf.for %arg5 = %c0 to %c12 step %c1 iter_args(%arg6 = %alloca_0) -> (memref<1x1x12x3xf32>) {
+# CHECK-NEXT:           %11 = affine.max #map5(%3)
+# CHECK-NEXT:           %12 = affine.min #map6(%11, %7)
+# CHECK-NEXT:           %13 = affine.min #map7(%7, %12, %3)
+# CHECK-NEXT:           %14 = affine.max #map4(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = affine.max #map(%arg5)
+# CHECK-NEXT:           %17 = affine.max #map1(%arg5)
+# CHECK-NEXT:           %18 = affine.min #map2(%17)
+# CHECK-NEXT:           %19 = affine.min #map3(%18, %16)
+# CHECK-NEXT:           %20 = affine.max #map4(%19)
+# CHECK-NEXT:           %21 = arith.cmpi eq, %20, %c0 : index
+# CHECK-NEXT:           %22 = arith.ori %21, %15 : i1
+# CHECK-NEXT:           %23 = scf.if %22 -> (memref<1x1x1x3xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_4 : memref<1x1x1x3xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %24 = linalg.index 0 : index
+# CHECK-NEXT:                 %25 = linalg.index 1 : index
+# CHECK-NEXT:                 %26 = linalg.index 2 : index
+# CHECK-NEXT:                 %27 = linalg.index 3 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_4 : memref<1x1x1x3xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_7 = memref.subview %subview_2[0, %12, %18, 0] [1, %14, %20, 3] [1, 1, 1, 1] : memref<1x?x8x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_8 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             memref.copy %subview_8, %alloca_5 : memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32>
+# CHECK-NEXT:             %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32>
+# CHECK-NEXT:             %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1x1x1xf32>
+# CHECK-NEXT:             %24 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %alloca_5) -> (memref<1x1x1x3xf32>) {
+# CHECK-NEXT:               %25 = affine.min #map6(%11, %14)
+# CHECK-NEXT:               %26 = affine.min #map7(%14, %25, %3)
+# CHECK-NEXT:               %27 = affine.max #map4(%26)
+# CHECK-NEXT:               %28 = arith.cmpi eq, %27, %c0 : index
+# CHECK-NEXT:               %29 = affine.apply #map8(%3, %27)
+# CHECK-NEXT:               %30 = affine.max #map5(%16)
+# CHECK-NEXT:               %31 = affine.min #map6(%30, %20)
+# CHECK-NEXT:               %32 = affine.min #map7(%20, %31, %16)
+# CHECK-NEXT:               %33 = affine.max #map4(%32)
+# CHECK-NEXT:               %34 = arith.cmpi eq, %33, %c0 : index
+# CHECK-NEXT:               %35 = arith.ori %34, %28 : i1
+# CHECK-NEXT:               %36 = affine.apply #map8(%16, %33)
+# CHECK-NEXT:               %37 = scf.if %35 -> (memref<1x1x1x1xf32>) {
+# CHECK-NEXT:                 linalg.map outs(%alloca_9 : memref<1x1x1x1xf32>)
+# CHECK-NEXT:                   () {
+# CHECK-NEXT:                     %38 = linalg.index 0 : index
+# CHECK-NEXT:                     %39 = linalg.index 1 : index
+# CHECK-NEXT:                     %40 = linalg.index 2 : index
+# CHECK-NEXT:                     %41 = linalg.index 3 : index
+# CHECK-NEXT:                     linalg.yield %cst : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 scf.yield %alloca_9 : memref<1x1x1x1xf32>
+# CHECK-NEXT:               } else {
+# CHECK-NEXT:                 %subview_12 = memref.subview %subview_7[0, %25, %31, %arg7] [1, %27, %33, 1] [1, 1, 1, 1] : memref<1x?x?x3xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.map outs(%alloca_10 : memref<1x1x1x1xf32>)
+# CHECK-NEXT:                   () {
+# CHECK-NEXT:                     %38 = linalg.index 0 : index
+# CHECK-NEXT:                     %39 = linalg.index 1 : index
+# CHECK-NEXT:                     %40 = linalg.index 2 : index
+# CHECK-NEXT:                     %41 = linalg.index 3 : index
+# CHECK-NEXT:                     linalg.yield %cst : f32
+# CHECK-NEXT:                   }
+# CHECK-NEXT:                 %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:                 %dim = memref.dim %subview_12, %c1_13 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %c2 = arith.constant 2 : index
+# CHECK-NEXT:                 %dim_14 = memref.dim %subview_12, %c2 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_15 = memref.subview %alloca_10[0, %3, %16, 0] [1, %dim, %dim_14, 1] [1, 1, 1, 1] : memref<1x1x1x1xf32> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>>
+# CHECK-NEXT:                 memref.copy %subview_12, %subview_15 : memref<1x?x?x1xf32, strided<[192, 24, 3, 1], offset: ?>> to memref<1x?x?x1xf32, strided<[1, 1, 1, 1], offset: ?>>
+# CHECK-NEXT:                 scf.yield %alloca_10 : memref<1x1x1x1xf32>
+# CHECK-NEXT:               }
+# CHECK-NEXT:               %subview_11 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>>
+# CHECK-NEXT:               memref.copy %37, %subview_11 : memref<1x1x1x1xf32> to memref<1x1x1x1xf32, strided<[3, 3, 3, 1], offset: ?>>
+# CHECK-NEXT:               scf.yield %arg8 : memref<1x1x1x3xf32>
+# CHECK-NEXT:             } {"./c"}
+# CHECK-NEXT:             scf.yield %24 : memref<1x1x1x3xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x12x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %23, %subview_6 : memref<1x1x1x3xf32> to memref<1x1x1x3xf32, strided<[36, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x1x12x3xf32>
+# CHECK-NEXT:         } {"./w"}
+# CHECK-NEXT:         scf.yield %10 : memref<1x1x12x3xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 12, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %9, %subview_1 : memref<1x1x12x3xf32> to memref<1x1x12x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x12x12x3xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %arg2) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %subview_1 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %3 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_1) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_3 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %4 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_3) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.fill {__xtc_id_conv_0_} ins(%cst : f32) outs(%subview_5 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>)
+# CHECK-NEXT:           %subview_6 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %subview_5, %subview_6 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %4, %subview_4 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %3, %subview_2 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     %subview = memref.subview %0[0, 0, 0, 0] [1, 11, 11, 3] [1, 1, 1, 1] : memref<1x12x12x3xf32> to memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>>
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (memref<1x4x4x16xf32>) {
+# CHECK-NEXT:       %3 = affine.apply #map9(%arg3)
+# CHECK-NEXT:       %subview_1 = memref.subview %subview[0, %3, 0, 0] [1, 5, 11, 3] [1, 1, 1, 1] : memref<1x11x11x3xf32, strided<[432, 36, 3, 1]>> to memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:       %subview_2 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c4 step %c1 iter_args(%arg6 = %subview_2) -> (memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:         %5 = affine.apply #map9(%arg5)
+# CHECK-NEXT:         %subview_4 = memref.subview %subview_1[0, 0, %5, 0] [1, 5, 5, 3] [1, 1, 1, 1] : memref<1x5x11x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:         %subview_5 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         %6 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_5) -> (memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_7 = memref.subview %arg1[0, 0, 0, %arg7] [5, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x16xf32> to memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_8 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           %7 = scf.for %arg9 = %c0 to %c5 step %c1 iter_args(%arg10 = %subview_8) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:             %subview_10 = memref.subview %subview_4[0, %arg9, 0, 0] [1, 1, 5, 3] [1, 1, 1, 1] : memref<1x5x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:             %subview_11 = memref.subview %subview_7[%arg9, 0, 0, 0] [1, 5, 3, 1] [1, 1, 1, 1] : memref<5x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:             %8 = scf.for %arg11 = %c0 to %c5 step %c1 iter_args(%arg12 = %arg10) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:               %subview_12 = memref.subview %subview_10[0, 0, %arg11, 0] [1, 1, 1, 3] [1, 1, 1, 1] : memref<1x1x5x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:               %subview_13 = memref.subview %subview_11[0, %arg11, 0, 0] [1, 1, 3, 1] [1, 1, 1, 1] : memref<1x5x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:               %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) {
+# CHECK-NEXT:                 %subview_14 = memref.subview %subview_12[0, 0, 0, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x3xf32, strided<[432, 36, 3, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>
+# CHECK-NEXT:                 %subview_15 = memref.subview %subview_13[0, 0, %arg13, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x3x1xf32, strided<[240, 48, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>
+# CHECK-NEXT:                 linalg.generic {indexing_maps = [#map10, #map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1xf32, strided<[432, 36, 3, 1], offset: ?>>, memref<1x1x1x1xf32, strided<[240, 48, 16, 1], offset: ?>>) outs(%arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>) attrs =  {__xtc_id_conv_} {
+# CHECK-NEXT:                 ^bb0(%in: f32, %in_16: f32, %out: f32):
+# CHECK-NEXT:                   %10 = arith.mulf %in, %in_16 : f32
+# CHECK-NEXT:                   %11 = arith.addf %out, %10 : f32
+# CHECK-NEXT:                   linalg.yield %11 : f32
+# CHECK-NEXT:                 }
+# CHECK-NEXT:                 scf.yield %arg14 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:               } {"./c"}
+# CHECK-NEXT:               scf.yield %9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:             } {"./s"}
+# CHECK-NEXT:             scf.yield %8 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           } {"./r"}
+# CHECK-NEXT:           %subview_9 = memref.subview %arg8[0, 0, 0, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %7, %subview_9 : memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x1xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./f"}
+# CHECK-NEXT:         %subview_6 = memref.subview %arg6[0, 0, %arg5, 0] [1, 1, 1, 16] [1, 1, 1, 1] : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %6, %subview_6 : memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x1x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./w"}
+# CHECK-NEXT:       %subview_3 = memref.subview %arg4[0, %arg3, 0, 0] [1, 1, 4, 16] [1, 1, 1, 1] : memref<1x4x4x16xf32> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_3 : memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>> to memref<1x1x4x16xf32, strided<[256, 64, 16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<1x4x4x16xf32>
+# CHECK-NEXT:     } {"./h"}
+# CHECK-NEXT:     memref.copy %2, %arg2 : memref<1x4x4x16xf32> to memref<1x4x4x16xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: pad_conv2d_nhwc_mini
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 1x8x8x3xfloat32
+# CHECK-NEXT:   - %1 : 5x5x3x16xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %3 : 1x4x4x16xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: pad2d(%0, padding={1: (2, 2), 2: (2, 2)}, constant_value=0) {name = 'pad'} : [1x8x8x3xfloat32] -> [1x12x12x3xfloat32]
+# CHECK-NEXT:   - %3: conv2d(%2, %1, stride=(2, 2)) {name = 'conv'} : [1x12x12x3xfloat32, 5x5x3x16xfloat32] -> [1x4x4x16xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
new file mode 100644
index 00000000..c5e42160
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_pad_matmul_unpad_mlir_tensor.py
@@ -0,0 +1,711 @@
+# RUN: python %s 2>&1 | filecheck %s
+# REQUIRES: module_mlir
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+# FileCheck test: pad -> matmul -> unpad graph compiled by the MLIR backend in tensor-dialect mode.
+I, J, K, dtype = 14, 14, 14, "float32"
+a = O.tensor((I, K), dtype, name="A")
+b = O.tensor((K, J), dtype, name="B")
+# Op creation order and the name= strings feed the SSA numbers / __xtc_id_* attrs pinned by CHECK.
+with O.graph(name="pad_matmul_unpad") as gb:
+    p1 = O.pad(a, padding=(0, 2), name="A_pad")  # expected IR: low[0, 0] high[2, 2], 14x14 -> 16x16
+    p2 = O.pad(b, padding=(0, 2), name="B_pad")  # same padding applied to the second operand
+    m_pad = O.matmul(p1, p2, name="matmul_padded")  # expected IR: 16x16 linalg.fill + linalg.matmul
+    O.unpad(m_pad, padding=(0, 2), name="C")  # expected IR: extract_slice of the 16x16 result to 14x14
+graph = gb.graph
+print(graph)
+# No scheduling directives are issued before schedule(); only the default node is selected.
+impl = Backend(graph, use_tensor_dialect=True)  # expected IR takes tensor inputs and a memref output
+sch = impl.get_scheduler(default_node="matmul_padded")
+sched = sch.schedule()
+# The print_* flags turn on the "IR Dump ..." sections that the CHECK lines below match.
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="gen_pad_tuple_matmul_unpad_mlir",  # NOTE(review): "pad_tuple" differs from the graph name - confirm intended
+    print_source_ir=True,  # presumably gates "IR Dump Before transform" - TODO confirm in MlirCompiler
+    print_transformed_ir=True,  # gates "IR Dump After transform"
+    print_bufferization_ir=True,  # gates "IR Dump Before/After Tensor Lowering"
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)  # presumably checks results against a reference - TODO confirm
+res = executor.execute()
+print(f"CODE: {res}")  # written to stdout, which the RUN line pipes into filecheck
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %padded = tensor.pad %arg0 nofold low[0, 0] high[2, 2] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index):
+# CHECK-NEXT:       tensor.yield %cst : f32
+# CHECK-NEXT:     } {__xtc_id_A_pad_} : tensor<14x14xf32> to tensor<16x16xf32>
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %padded_1 = tensor.pad %arg1 nofold low[0, 0] high[2, 2] {
+# CHECK-NEXT:     ^bb0(%arg3: index, %arg4: index):
+# CHECK-NEXT:       tensor.yield %cst_0 : f32
+# CHECK-NEXT:     } {__xtc_id_B_pad_} : tensor<14x14xf32> to tensor<16x16xf32>
+# CHECK-NEXT:     %2 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_2 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %3 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_2 : f32) outs(%2 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %4 = linalg.matmul {__xtc_id_matmul_padded_} ins(%padded, %padded_1 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%3 : tensor<16x16xf32>) -> tensor<16x16xf32>
+# CHECK-NEXT:     %5 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %4[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_A_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_B_pad_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_matmul_padded_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %tiled_linalg_op_6 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./j" : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_matmul_padded_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %tiled_linalg_op_10 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./k" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } {__xtc_id_A_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_2 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst_0 : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } {__xtc_id_B_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %6 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_6 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_13 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_16 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:     %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_15 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_16 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_21 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_27 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_23 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %9 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (d0, 14)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (0, d0)>
+# CHECK-NEXT: #map5 = affine_map<(d0, d1) -> (d0 - d1)>
+# CHECK-NEXT: #map6 = affine_map<(d0, d1) -> (d0 - d1, 1)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: tensor<14x14xf32> {llvm.noalias}, %arg1: tensor<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %1) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg0[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst : f32
+# CHECK-NEXT:             } {__xtc_id_A_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %c0_1 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_2 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_3 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg3 = %c0_1 to %c16_2 step %c1_3 iter_args(%arg4 = %4) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %10 = affine.min #map(%arg3)
+# CHECK-NEXT:       %11 = affine.apply #map1(%10)
+# CHECK-NEXT:       %12 = affine.min #map2(%10)
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %13 = arith.cmpi eq, %12, %c0_11 : index
+# CHECK-NEXT:       %14 = affine.apply #map3(%12)
+# CHECK-NEXT:       %15 = affine.apply #map3(%12)
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c14 = arith.constant 14 : index
+# CHECK-NEXT:       %16 = arith.cmpi eq, %c14, %c0_12 : index
+# CHECK-NEXT:       %17 = arith.ori %16, %13 : i1
+# CHECK-NEXT:       %18 = scf.if %17 -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %generated = tensor.generate  {
+# CHECK-NEXT:         ^bb0(%arg5: index, %arg6: index):
+# CHECK-NEXT:           tensor.yield %cst_0 : f32
+# CHECK-NEXT:         } : tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %generated : tensor<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %extracted_slice_13 = tensor.extract_slice %arg1[%10, 0] [%12, 14] [1, 1] : tensor<14x14xf32> to tensor<?x14xf32>
+# CHECK-NEXT:         %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:         %19 = tensor.empty() : tensor<1x16xf32>
+# CHECK-NEXT:         %c0_15 = arith.constant 0 : index
+# CHECK-NEXT:         %c0_16 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_17 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %20 = scf.for %arg5 = %c0_16 to %c16_17 step %c1_18 iter_args(%arg6 = %19) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:           %c0_19 = arith.constant 0 : index
+# CHECK-NEXT:           %21 = affine.min #map4(%12)
+# CHECK-NEXT:           %22 = affine.apply #map5(%12, %21)
+# CHECK-NEXT:           %23 = affine.min #map6(%12, %21)
+# CHECK-NEXT:           %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:           %24 = arith.cmpi eq, %23, %c0_20 : index
+# CHECK-NEXT:           %25 = affine.apply #map3(%23)
+# CHECK-NEXT:           %26 = affine.apply #map3(%23)
+# CHECK-NEXT:           %27 = affine.min #map(%arg5)
+# CHECK-NEXT:           %28 = affine.apply #map1(%27)
+# CHECK-NEXT:           %29 = affine.min #map2(%27)
+# CHECK-NEXT:           %c0_21 = arith.constant 0 : index
+# CHECK-NEXT:           %30 = arith.cmpi eq, %29, %c0_21 : index
+# CHECK-NEXT:           %31 = arith.ori %30, %24 : i1
+# CHECK-NEXT:           %32 = affine.apply #map3(%29)
+# CHECK-NEXT:           %33 = affine.apply #map3(%29)
+# CHECK-NEXT:           %34 = scf.if %31 -> (tensor<1x1xf32>) {
+# CHECK-NEXT:             %generated = tensor.generate  {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } : tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %generated : tensor<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %extracted_slice_23 = tensor.extract_slice %extracted_slice_13[%21, %27] [%23, %29] [1, 1] : tensor<?x14xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %padded = tensor.pad %extracted_slice_23 nofold low[0, 0] high[%26, %33] {
+# CHECK-NEXT:             ^bb0(%arg7: index, %arg8: index):
+# CHECK-NEXT:               tensor.yield %cst_0 : f32
+# CHECK-NEXT:             } {__xtc_id_B_pad_} : tensor<?x?xf32> to tensor<?x?xf32>
+# CHECK-NEXT:             %cast_24 = tensor.cast %padded : tensor<?x?xf32> to tensor<1x1xf32>
+# CHECK-NEXT:             scf.yield %cast_24 : tensor<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %inserted_slice_22 = tensor.insert_slice %34 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_22 : tensor<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         %cast = tensor.cast %20 : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %cast : tensor<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %18 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %6 = tensor.empty() : tensor<16x16xf32>
+# CHECK-NEXT:     %cst_4 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_5 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_6 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_7 = arith.constant 1 : index
+# CHECK-NEXT:     %7 = scf.for %arg3 = %c0_5 to %c16_6 step %c1_7 iter_args(%arg4 = %6) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_12 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_13 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_12 to %c16_13 step %c1_14 iter_args(%arg6 = %extracted_slice_11) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %11 = linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst_4 : f32) outs(%extracted_slice_15 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_16 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_16 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_8 = arith.constant 0 : index
+# CHECK-NEXT:     %c16_9 = arith.constant 16 : index
+# CHECK-NEXT:     %c1_10 = arith.constant 1 : index
+# CHECK-NEXT:     %8 = scf.for %arg3 = %c0_8 to %c16_9 step %c1_10 iter_args(%arg4 = %7) -> (tensor<16x16xf32>) {
+# CHECK-NEXT:       %extracted_slice_11 = tensor.extract_slice %2[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %extracted_slice_12 = tensor.extract_slice %5[0, 0] [16, 16] [1, 1] : tensor<16x16xf32> to tensor<16x16xf32>
+# CHECK-NEXT:       %extracted_slice_13 = tensor.extract_slice %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<16x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:       %c0_14 = arith.constant 0 : index
+# CHECK-NEXT:       %c16_15 = arith.constant 16 : index
+# CHECK-NEXT:       %c1_16 = arith.constant 1 : index
+# CHECK-NEXT:       %10 = scf.for %arg5 = %c0_14 to %c16_15 step %c1_16 iter_args(%arg6 = %extracted_slice_13) -> (tensor<1x16xf32>) {
+# CHECK-NEXT:         %extracted_slice_17 = tensor.extract_slice %extracted_slice_11[0, 0] [1, 16] [1, 1] : tensor<1x16xf32> to tensor<1x16xf32>
+# CHECK-NEXT:         %extracted_slice_18 = tensor.extract_slice %extracted_slice_12[0, %arg5] [16, 1] [1, 1] : tensor<16x16xf32> to tensor<16x1xf32>
+# CHECK-NEXT:         %extracted_slice_19 = tensor.extract_slice %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_20 = arith.constant 0 : index
+# CHECK-NEXT:         %c16_21 = arith.constant 16 : index
+# CHECK-NEXT:         %c1_22 = arith.constant 1 : index
+# CHECK-NEXT:         %11 = scf.for %arg7 = %c0_20 to %c16_21 step %c1_22 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_24 = tensor.extract_slice %extracted_slice_17[0, %arg7] [1, 1] [1, 1] : tensor<1x16xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_25 = tensor.extract_slice %extracted_slice_18[%arg7, 0] [1, 1] [1, 1] : tensor<16x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_26 = tensor.extract_slice %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %12 = linalg.matmul {__xtc_id_matmul_padded_} ins(%extracted_slice_24, %extracted_slice_25 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_26 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_27 = tensor.insert_slice %12 into %arg8[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_27 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_23 = tensor.insert_slice %11 into %arg6[0, %arg5] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x16xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_23 : tensor<1x16xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %10 into %arg4[%arg3, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<16x16xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %9 = tensor.empty() : tensor<14x14xf32>
+# CHECK-NEXT:     %extracted_slice = tensor.extract_slice %8[0, 0] [14, 14] [1, 1] {__xtc_id_C_} : tensor<16x16xf32> to tensor<14x14xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %extracted_slice in restrict writable %arg2 : (tensor<14x14xf32>, memref<14x14xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: #map = affine_map<(d0) -> (14, d0)>
+# CHECK-NEXT: #map1 = affine_map<(d0) -> (-d0 + 14, 1)>
+# CHECK-NEXT: #map2 = affine_map<(d0) -> (-d0 + 14, 0, 1)>
+# CHECK-NEXT: #map3 = affine_map<(d0, d1) -> (1, d0 - d1)>
+# CHECK-NEXT: #map4 = affine_map<(d0) -> (-d0 + 1)>
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @pad_matmul_unpad(%arg0: memref<14x14xf32> {llvm.noalias}, %arg1: memref<14x14xf32> {llvm.noalias}, %arg2: memref<14x14xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c16 = arith.constant 16 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_0 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_1 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %alloca_2 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %0 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_0) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %4 = affine.min #map(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map1(%4)
+# CHECK-NEXT:       %6 = arith.cmpi eq, %5, %c0 : index
+# CHECK-NEXT:       %7 = scf.if %6 -> (memref<1x16xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca_1 : memref<1x16xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %8 = linalg.index 0 : index
+# CHECK-NEXT:             %9 = linalg.index 1 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca_1 : memref<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_7 = memref.subview %arg0[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref<?x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %alloca_2 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32>
+# CHECK-NEXT:         %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_2) -> (memref<1x16xf32>) {
+# CHECK-NEXT:           %9 = affine.min #map2(%4)
+# CHECK-NEXT:           %10 = affine.min #map3(%5, %9)
+# CHECK-NEXT:           %11 = arith.cmpi eq, %10, %c0 : index
+# CHECK-NEXT:           %12 = affine.apply #map4(%10)
+# CHECK-NEXT:           %13 = affine.min #map(%arg5)
+# CHECK-NEXT:           %14 = affine.min #map1(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = arith.ori %15, %11 : i1
+# CHECK-NEXT:           %17 = affine.apply #map4(%14)
+# CHECK-NEXT:           %18 = scf.if %16 -> (memref<1x1xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_9 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_9 : memref<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref<?x14xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             linalg.map outs(%alloca_10 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:             %dim = memref.dim %subview_12, %c0_13 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:             %dim_15 = memref.dim %subview_12, %c1_14 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             memref.copy %subview_12, %subview_16 : memref<?x?xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             scf.yield %alloca_10 : memref<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         scf.yield %8 : memref<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %alloca_3 = memref.alloca() {alignment = 256 : i64} : memref<16x16xf32>
+# CHECK-NEXT:     %alloca_4 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %alloca_5 = memref.alloca() {alignment = 256 : i64} : memref<1x16xf32>
+# CHECK-NEXT:     %1 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca_3) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %4 = affine.min #map(%arg3)
+# CHECK-NEXT:       %5 = affine.min #map1(%4)
+# CHECK-NEXT:       %6 = arith.cmpi eq, %5, %c0 : index
+# CHECK-NEXT:       %7 = scf.if %6 -> (memref<1x16xf32>) {
+# CHECK-NEXT:         linalg.map outs(%alloca_4 : memref<1x16xf32>)
+# CHECK-NEXT:           () {
+# CHECK-NEXT:             %8 = linalg.index 0 : index
+# CHECK-NEXT:             %9 = linalg.index 1 : index
+# CHECK-NEXT:             linalg.yield %cst : f32
+# CHECK-NEXT:           }
+# CHECK-NEXT:         scf.yield %alloca_4 : memref<1x16xf32>
+# CHECK-NEXT:       } else {
+# CHECK-NEXT:         %subview_7 = memref.subview %arg1[%4, 0] [%5, 14] [1, 1] : memref<14x14xf32> to memref<?x14xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:         %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %alloca_5 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32>
+# CHECK-NEXT:         %alloca_9 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %alloca_10 = memref.alloca() {alignment = 256 : i64} : memref<1x1xf32>
+# CHECK-NEXT:         %8 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %alloca_5) -> (memref<1x16xf32>) {
+# CHECK-NEXT:           %9 = affine.min #map2(%4)
+# CHECK-NEXT:           %10 = affine.min #map3(%5, %9)
+# CHECK-NEXT:           %11 = arith.cmpi eq, %10, %c0 : index
+# CHECK-NEXT:           %12 = affine.apply #map4(%10)
+# CHECK-NEXT:           %13 = affine.min #map(%arg5)
+# CHECK-NEXT:           %14 = affine.min #map1(%13)
+# CHECK-NEXT:           %15 = arith.cmpi eq, %14, %c0 : index
+# CHECK-NEXT:           %16 = arith.ori %15, %11 : i1
+# CHECK-NEXT:           %17 = affine.apply #map4(%14)
+# CHECK-NEXT:           %18 = scf.if %16 -> (memref<1x1xf32>) {
+# CHECK-NEXT:             linalg.map outs(%alloca_9 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             scf.yield %alloca_9 : memref<1x1xf32>
+# CHECK-NEXT:           } else {
+# CHECK-NEXT:             %subview_12 = memref.subview %subview_7[%9, %13] [%10, %14] [1, 1] : memref<?x14xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             linalg.map outs(%alloca_10 : memref<1x1xf32>)
+# CHECK-NEXT:               () {
+# CHECK-NEXT:                 %19 = linalg.index 0 : index
+# CHECK-NEXT:                 %20 = linalg.index 1 : index
+# CHECK-NEXT:                 linalg.yield %cst : f32
+# CHECK-NEXT:               }
+# CHECK-NEXT:             %c0_13 = arith.constant 0 : index
+# CHECK-NEXT:             %dim = memref.dim %subview_12, %c0_13 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %c1_14 = arith.constant 1 : index
+# CHECK-NEXT:             %dim_15 = memref.dim %subview_12, %c1_14 : memref<?x?xf32, strided<[14, 1], offset: ?>>
+# CHECK-NEXT:             %subview_16 = memref.subview %alloca_10[0, 0] [%dim, %dim_15] [1, 1] : memref<1x1xf32> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             memref.copy %subview_12, %subview_16 : memref<?x?xf32, strided<[14, 1], offset: ?>> to memref<?x?xf32, strided<[1, 1]>>
+# CHECK-NEXT:             scf.yield %alloca_10 : memref<1x1xf32>
+# CHECK-NEXT:           }
+# CHECK-NEXT:           %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           memref.copy %18, %subview_11 : memref<1x1xf32> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           scf.yield %arg6 : memref<1x16xf32>
+# CHECK-NEXT:         } {"./j"}
+# CHECK-NEXT:         scf.yield %8 : memref<1x16xf32>
+# CHECK-NEXT:       }
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %7, %subview_6 : memref<1x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %2 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %alloca) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_6 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_6) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_8 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_matmul_padded_0_} ins(%cst : f32) outs(%subview_8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_9 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_8, %subview_9 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_7 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = scf.for %arg3 = %c0 to %c16 step %c1 iter_args(%arg4 = %2) -> (memref<16x16xf32>) {
+# CHECK-NEXT:       %subview_6 = memref.subview %0[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %subview_7 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %subview_7) -> (memref<1x16xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_9 = memref.subview %1[0, %arg5] [16, 1] [1, 1] : memref<16x16xf32> to memref<16x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %subview_10 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg7 = %c0 to %c16 step %c1 iter_args(%arg8 = %subview_10) -> (memref<1x1xf32, strided<[16, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_12 = memref.subview %subview_6[0, %arg7] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           %subview_13 = memref.subview %subview_9[%arg7, 0] [1, 1] [1, 1] : memref<16x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_matmul_padded_} ins(%subview_12, %subview_13 : memref<1x1xf32, strided<[16, 1], offset: ?>>, memref<1x1xf32, strided<[16, 1], offset: ?>>) outs(%arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>)
+# CHECK-NEXT:           scf.yield %arg8 : memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_11 = memref.subview %arg6[0, %arg5] [1, 1] [1, 1] : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_11 : memref<1x1xf32, strided<[16, 1], offset: ?>> to memref<1x1xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg6 : memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_8 = memref.subview %arg4[%arg3, 0] [1, 16] [1, 1] : memref<16x16xf32> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_8 : memref<1x16xf32, strided<[16, 1], offset: ?>> to memref<1x16xf32, strided<[16, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg4 : memref<16x16xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %subview = memref.subview %3[0, 0] [14, 14] [1, 1] : memref<16x16xf32> to memref<14x14xf32, strided<[16, 1]>>
+# CHECK-NEXT:     memref.copy %subview, %arg2 : memref<14x14xf32, strided<[16, 1]>> to memref<14x14xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: pad_matmul_unpad
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 14x14xfloat32
+# CHECK-NEXT:   - %1 : 14x14xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %5 : 14x14xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %2: pad(%0, padding=(0, 2), constant_value=0) {name = 'A_pad'} : [14x14xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %3: pad(%1, padding=(0, 2), constant_value=0) {name = 'B_pad'} : [14x14xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %4: matmul(%2, %3) {name = 'matmul_padded'} : [16x16xfloat32, 16x16xfloat32] -> [16x16xfloat32]
+# CHECK-NEXT:   - %5: unpad(%4, padding=(0, 2)) {name = 'C'} : [16x16xfloat32] -> [14x14xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0
diff --git a/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
new file mode 100644
index 00000000..ac4cee48
--- /dev/null
+++ b/tests/filecheck/backends/tensor_dialect/test_two_matmuls_mlir_tensor.py
@@ -0,0 +1,392 @@
+# RUN: python %s 2>&1 | filecheck %s
+# UNSUPPORTED: mlir-target=nvgpu
+
+import xtc.graphs.xtc.op as O
+from xtc.backends.mlir import Backend
+
+I, J, K, dtype = 4, 32, 512, "float32"  # matrix extents and element type shared by the three inputs
+a = O.tensor((I, K), dtype, name="A")  # A: I x K (4x512)
+b = O.tensor((K, J), dtype, name="B")  # B: K x J (512x32)
+c = O.tensor((J, I), dtype, name="C")  # C: J x I (32x4)
+
+with O.graph(name="matmul") as gb:
+    d = O.matmul(a, b, name="D")  # D = A x B, a 4x32 intermediate
+    O.matmul(c, d, name="E")  # E = C x D, 32x32; consumes the intermediate D
+
+graph = gb.graph
+print(graph)  # goes to stdout, which the lit command at the top of the file pipes into filecheck
+
+impl = Backend(graph, use_tensor_dialect=True)  # expected IR below takes tensor<...> inputs and a memref output
+
+sch = impl.get_scheduler(default_node = "E")  # NOTE(review): presumably makes "E" the node scheduled by default - confirm
+sched = sch.schedule()  # no scheduling directives were issued on sch before this call
+
+comp = impl.get_compiler(
+    shared_lib=True,
+    dump_file="two_matmul_mlir_tensor",
+    print_source_ir=True,  # presumably enables the "IR Dump Before transform" section - confirm
+    print_transformed_ir=True,  # enables the "IR Dump After transform" section
+    print_bufferization_ir=True,  # enables the "IR Dump Before/After Tensor Lowering" sections
+)
+module = comp.compile(sched)
+executor = module.get_executor(validate=True)  # NOTE(review): validate=True presumably compares against a reference result - confirm
+res = executor.execute()
+print(f"CODE: {res}")  # last statement of the script: reports the value returned by execute()
+
+# CHECK: // -----// IR Dump Before transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %1 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %2 = linalg.matmul {__xtc_id_D_} ins(%arg0, %arg1 : tensor<4x512xf32>, tensor<512x32xf32>) outs(%1 : tensor<4x32xf32>) -> tensor<4x32xf32>
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
+# CHECK-NEXT:     %cst_0 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %4 = linalg.fill {__xtc_id_E_0_} ins(%cst_0 : f32) outs(%3 : tensor<32x32xf32>) -> tensor<32x32xf32>
+# CHECK-NEXT:     %5 = linalg.matmul {__xtc_id_E_} ins(%arg2, %2 : tensor<32x4xf32>, tensor<4x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32>
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @_vecto(%arg0: !transform.any_op {transform.consumed}) {
+# CHECK-NEXT:     transform.structured.vectorize %arg0 : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT:   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+# CHECK-NEXT:     %0 = transform.structured.match attributes {__xtc_id_D_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_0, %loops_1 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_1 "./j" : !transform.any_op
+# CHECK-NEXT:     %1 = transform.structured.match attributes {__xtc_id_D_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_2, %loops_3 = transform.structured.tile_using_for %1 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_3 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_4, %loops_5 = transform.structured.tile_using_for %tiled_linalg_op_2 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_5 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_6, %loops_7 = transform.structured.tile_using_for %tiled_linalg_op_4 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_7 "./k" : !transform.any_op
+# CHECK-NEXT:     %2 = transform.structured.match attributes {__xtc_id_E_0_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_8, %loops_9 = transform.structured.tile_using_for %2 tile_sizes [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_9 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_10, %loops_11 = transform.structured.tile_using_for %tiled_linalg_op_8 tile_sizes [0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_11 "./j" : !transform.any_op
+# CHECK-NEXT:     %3 = transform.structured.match attributes {__xtc_id_E_} in %arg0 : (!transform.any_op) -> !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_12, %loops_13 = transform.structured.tile_using_for %3 tile_sizes [1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_13 "./i" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_14, %loops_15 = transform.structured.tile_using_for %tiled_linalg_op_12 tile_sizes [0, 1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_15 "./j" : !transform.any_op
+# CHECK-NEXT:     %tiled_linalg_op_16, %loops_17 = transform.structured.tile_using_for %tiled_linalg_op_14 tile_sizes [0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+# CHECK-NEXT:     transform.annotate %loops_17 "./k" : !transform.any_op
+# CHECK-NEXT:     transform.yield 
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After transform //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_23 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
+# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_20 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump Before Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: tensor<4x512xf32> {llvm.noalias}, %arg1: tensor<512x32xf32> {llvm.noalias}, %arg2: tensor<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %0 = tensor.empty() : tensor<4x32xf32>
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_0 = arith.constant 0 : index
+# CHECK-NEXT:     %c4_1 = arith.constant 4 : index
+# CHECK-NEXT:     %c1_2 = arith.constant 1 : index
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0_0 to %c4_1 step %c1_2 iter_args(%arg5 = %1) -> (tensor<4x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [1, 512] [1, 1] : tensor<4x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %arg1[0, 0] [512, 32] [1, 1] : tensor<512x32xf32> to tensor<512x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<4x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 512] [1, 1] : tensor<1x512xf32> to tensor<1x512xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [512, 1] [1, 1] : tensor<512x32xf32> to tensor<512x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c512 = arith.constant 512 : index
+# CHECK-NEXT:         %c1_18 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c512 step %c1_18 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_20 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x512xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<512x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_D_} ins(%extracted_slice_20, %extracted_slice_21 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_22 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_23 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_23 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_19 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_19 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<4x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = tensor.empty() : tensor<32x32xf32>
+# CHECK-NEXT:     %cst_3 = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %c0_4 = arith.constant 0 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_5 = arith.constant 1 : index
+# CHECK-NEXT:     %4 = scf.for %arg4 = %c0_4 to %c32 step %c1_5 iter_args(%arg5 = %3) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_9 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_10 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_11 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_9 to %c32_10 step %c1_11 iter_args(%arg7 = %extracted_slice) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_12 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %7 = linalg.fill {__xtc_id_E_0_} ins(%cst_3 : f32) outs(%extracted_slice_12 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:         %inserted_slice_13 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_13 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %c0_6 = arith.constant 0 : index
+# CHECK-NEXT:     %c32_7 = arith.constant 32 : index
+# CHECK-NEXT:     %c1_8 = arith.constant 1 : index
+# CHECK-NEXT:     %5 = scf.for %arg4 = %c0_6 to %c32_7 step %c1_8 iter_args(%arg5 = %4) -> (tensor<32x32xf32>) {
+# CHECK-NEXT:       %extracted_slice = tensor.extract_slice %arg2[%arg4, 0] [1, 4] [1, 1] : tensor<32x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:       %extracted_slice_9 = tensor.extract_slice %2[0, 0] [4, 32] [1, 1] : tensor<4x32xf32> to tensor<4x32xf32>
+# CHECK-NEXT:       %extracted_slice_10 = tensor.extract_slice %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<32x32xf32> to tensor<1x32xf32>
+# CHECK-NEXT:       %c0_11 = arith.constant 0 : index
+# CHECK-NEXT:       %c32_12 = arith.constant 32 : index
+# CHECK-NEXT:       %c1_13 = arith.constant 1 : index
+# CHECK-NEXT:       %6 = scf.for %arg6 = %c0_11 to %c32_12 step %c1_13 iter_args(%arg7 = %extracted_slice_10) -> (tensor<1x32xf32>) {
+# CHECK-NEXT:         %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf32> to tensor<1x4xf32>
+# CHECK-NEXT:         %extracted_slice_15 = tensor.extract_slice %extracted_slice_9[0, %arg6] [4, 1] [1, 1] : tensor<4x32xf32> to tensor<4x1xf32>
+# CHECK-NEXT:         %extracted_slice_16 = tensor.extract_slice %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x32xf32> to tensor<1x1xf32>
+# CHECK-NEXT:         %c0_17 = arith.constant 0 : index
+# CHECK-NEXT:         %c4_18 = arith.constant 4 : index
+# CHECK-NEXT:         %c1_19 = arith.constant 1 : index
+# CHECK-NEXT:         %7 = scf.for %arg8 = %c0_17 to %c4_18 step %c1_19 iter_args(%arg9 = %extracted_slice_16) -> (tensor<1x1xf32>) {
+# CHECK-NEXT:           %extracted_slice_21 = tensor.extract_slice %extracted_slice_14[0, %arg8] [1, 1] [1, 1] : tensor<1x4xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_22 = tensor.extract_slice %extracted_slice_15[%arg8, 0] [1, 1] [1, 1] : tensor<4x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %extracted_slice_23 = tensor.extract_slice %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
+# CHECK-NEXT:           %8 = linalg.matmul {__xtc_id_E_} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_23 : tensor<1x1xf32>) -> tensor<1x1xf32>
+# CHECK-NEXT:           %inserted_slice_24 = tensor.insert_slice %8 into %arg9[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x1xf32>
+# CHECK-NEXT:           scf.yield %inserted_slice_24 : tensor<1x1xf32>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %inserted_slice_20 = tensor.insert_slice %7 into %arg7[0, %arg6] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x32xf32>
+# CHECK-NEXT:         scf.yield %inserted_slice_20 : tensor<1x32xf32>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %inserted_slice = tensor.insert_slice %6 into %arg5[%arg4, 0] [1, 32] [1, 1] : tensor<1x32xf32> into tensor<32x32xf32>
+# CHECK-NEXT:       scf.yield %inserted_slice : tensor<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     bufferization.materialize_in_destination %5 in restrict writable %arg3 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: // -----// IR Dump After Tensor Lowering //----- //
+# CHECK-NEXT: module attributes {transform.with_named_sequence} {
+# CHECK-NEXT:   func.func @matmul(%arg0: memref<4x512xf32> {llvm.noalias}, %arg1: memref<512x32xf32> {llvm.noalias}, %arg2: memref<32x4xf32> {llvm.noalias}, %arg3: memref<32x32xf32> {llvm.noalias}) {
+# CHECK-NEXT:     %c512 = arith.constant 512 : index
+# CHECK-NEXT:     %c32 = arith.constant 32 : index
+# CHECK-NEXT:     %c1 = arith.constant 1 : index
+# CHECK-NEXT:     %c4 = arith.constant 4 : index
+# CHECK-NEXT:     %c0 = arith.constant 0 : index
+# CHECK-NEXT:     %cst = arith.constant 0.000000e+00 : f32
+# CHECK-NEXT:     %alloca = memref.alloca() {alignment = 256 : i64} : memref<4x32xf32>
+# CHECK-NEXT:     %0 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %alloca) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_D_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %1 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0) -> (memref<4x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg0[%arg4, 0] [1, 512] [1, 1] : memref<4x512xf32> to memref<1x512xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %arg1[0, %arg6] [512, 1] [1, 1] : memref<512x32xf32> to memref<512x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0 to %c512 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x512xf32, strided<[512, 1], offset: ?>> to memref<1x1xf32, strided<[512, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<512x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_D_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[512, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<4x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<4x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %2 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %arg3) -> (memref<32x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_1 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         linalg.fill {__xtc_id_E_0_} ins(%cst : f32) outs(%subview_1 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:         %subview_2 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %subview_1, %subview_2 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_0 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     %3 = scf.for %arg4 = %c0 to %c32 step %c1 iter_args(%arg5 = %2) -> (memref<32x32xf32>) {
+# CHECK-NEXT:       %subview = memref.subview %arg2[%arg4, 0] [1, 4] [1, 1] : memref<32x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:       %subview_0 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       %4 = scf.for %arg6 = %c0 to %c32 step %c1 iter_args(%arg7 = %subview_0) -> (memref<1x32xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:         %subview_2 = memref.subview %1[0, %arg6] [4, 1] [1, 1] : memref<4x32xf32> to memref<4x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %subview_3 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         %5 = scf.for %arg8 = %c0 to %c4 step %c1 iter_args(%arg9 = %subview_3) -> (memref<1x1xf32, strided<[32, 1], offset: ?>>) {
+# CHECK-NEXT:           %subview_5 = memref.subview %subview[0, %arg8] [1, 1] [1, 1] : memref<1x4xf32, strided<[4, 1], offset: ?>> to memref<1x1xf32, strided<[4, 1], offset: ?>>
+# CHECK-NEXT:           %subview_6 = memref.subview %subview_2[%arg8, 0] [1, 1] [1, 1] : memref<4x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:           linalg.matmul {__xtc_id_E_} ins(%subview_5, %subview_6 : memref<1x1xf32, strided<[4, 1], offset: ?>>, memref<1x1xf32, strided<[32, 1], offset: ?>>) outs(%arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>)
+# CHECK-NEXT:           scf.yield %arg9 : memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         } {"./k"}
+# CHECK-NEXT:         %subview_4 = memref.subview %arg7[0, %arg6] [1, 1] [1, 1] : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         memref.copy %5, %subview_4 : memref<1x1xf32, strided<[32, 1], offset: ?>> to memref<1x1xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:         scf.yield %arg7 : memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       } {"./j"}
+# CHECK-NEXT:       %subview_1 = memref.subview %arg5[%arg4, 0] [1, 32] [1, 1] : memref<32x32xf32> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       memref.copy %4, %subview_1 : memref<1x32xf32, strided<[32, 1], offset: ?>> to memref<1x32xf32, strided<[32, 1], offset: ?>>
+# CHECK-NEXT:       scf.yield %arg5 : memref<32x32xf32>
+# CHECK-NEXT:     } {"./i"}
+# CHECK-NEXT:     memref.copy %3, %arg3 : memref<32x32xf32> to memref<32x32xf32>
+# CHECK-NEXT:     return
+# CHECK-NEXT:   }
+# CHECK-NEXT: }
+# CHECK-NEXT:  
+# CHECK-NEXT: graph:
+# CHECK-NEXT:   name: matmul
+# CHECK-NEXT:   inputs:
+# CHECK-NEXT:   - %0 : 4x512xfloat32
+# CHECK-NEXT:   - %1 : 512x32xfloat32
+# CHECK-NEXT:   - %2 : 32x4xfloat32
+# CHECK-NEXT:   outputs:
+# CHECK-NEXT:   - %4 : 32x32xfloat32
+# CHECK-NEXT:   nodes:
+# CHECK-NEXT:   - %3: matmul(%0, %1) {name = 'D'} : [4x512xfloat32, 512x32xfloat32] -> [4x32xfloat32]
+# CHECK-NEXT:   - %4: matmul(%2, %3) {name = 'E'} : [32x4xfloat32, 4x32xfloat32] -> [32x32xfloat32]
+# CHECK-NEXT:  
+# CHECK-NEXT: CODE: 0